diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..1a85a480c --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,12 @@ +Checks: '-*,readability-identifier-naming' +HeaderFilterRegex: '.*' +WarningsAsErrors: '*' +CheckOptions: + - key: readability-identifier-naming.StructCase + value: lower_case + - key: readability-identifier-naming.UnionCase + value: lower_case + - key: readability-identifier-naming.FunctionCase + value: lower_case + - key: readability-identifier-naming.TypedefCase + value: lower_case diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 40957eef4..c1c1f34ff 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -22,43 +22,33 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Setup dependencies + + - name: Set up dependencies run: | sudo apt update sudo apt install -y lcov libsqlite3-dev liblz4-dev libuv1-dev - - name: Build raft - env: - CC: ${{ matrix.compiler }} - run: | - git clone https://github.com/canonical/raft.git --depth 1 - cd raft - autoreconf -i - ./configure --enable-debug --enable-sanitize - make -j4 - sudo make install - sudo ldconfig - cd .. - - name: Build dqlite env: CC: ${{ matrix.compiler }} run: | autoreconf -i - ./configure --enable-debug --enable-code-coverage --enable-sanitize - make CFLAGS=-O0 -j2 + ./configure --enable-debug --enable-code-coverage --enable-sanitize --enable-build-raft + make -j4 - name: Test env: CC: ${{ matrix.compiler }} run: | export ${{ matrix.tracing }} - make CFLAGS=-O0 -j2 check || (cat ./test-suite.log && false) + make -j4 check || (cat ./test-suite.log && false) - name: Coverage env: CC: ${{ matrix.compiler }} - run: if [ "${CC}" = "gcc" ]; then make code-coverage-capture; fi + if: ${{ matrix.compiler == 'gcc' }} + run: | + make code-coverage-capture - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index a5cf901a8..69e1d4fdf 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -28,31 +28,17 @@ jobs: sudo make -j$(nproc) install sudo ldconfig - - name: Check out raft - uses: actions/checkout@v4 - with: - repository: canonical/raft - path: raft - - - name: Install raft - run: | - cd raft - autoreconf -i - ./configure --enable-debug --enable-uv --enable-sanitize --enable-backtrace - sudo make -j$(nproc) install - sudo ldconfig - - name: Check out dqlite uses: actions/checkout@v4 with: - ref: refs/pull/${{ github.event.issue.number }}/head + ref: refs/pull/${{ github.event.issue.number }}/merge path: dqlite - name: Install dqlite run: | cd dqlite autoreconf -i - ./configure --enable-debug --enable-sanitize --enable-backtrace + ./configure --enable-debug --enable-sanitize --enable-backtrace --enable-build-raft sudo make -j$(nproc) sudo make install sudo ldconfig diff --git a/.github/workflows/external-raft.yml b/.github/workflows/external-raft.yml new file mode 100644 index 000000000..da23836f8 --- /dev/null +++ b/.github/workflows/external-raft.yml @@ -0,0 +1,38 @@ +name: CI Tests (external libraft) + +on: + - push + - pull_request + +jobs: + build-and-test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Setup dependencies + run: | + sudo apt update + sudo apt install -y libsqlite3-dev liblz4-dev libuv1-dev + + - name: Build raft + run: | + git clone https://github.com/canonical/raft --depth 1 + cd raft + autoreconf -i + ./configure --enable-debug 
--enable-sanitize + make -j4 + sudo make install + sudo ldconfig + + - name: Build dqlite + run: | + autoreconf -i + ./configure --enable-debug --enable-sanitize + make -j4 + + - name: Test + run: | + export LIBRAFT_TRACE=1 LIBDQLITE_TRACE=1 + make -j4 check || (cat ./test-suite.log && false) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 3d9639c11..7f016d2f2 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -10,10 +10,20 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: DoozyX/clang-format-lint-action@v0.16 with: - source: 'src test include' - exclude: 'test/lib/munit.*' - extensions: 'c,h' - clangFormatVersion: 14 - style: file + fetch-depth: 2 + - name: Install apt dependencies + run: | + sudo apt update + sudo apt install -y libsqlite3-dev liblz4-dev libuv1-dev bear + - uses: KyleMayes/install-llvm-action@master + with: + version: 17 + - name: Run clang-format + run: | + find . \( -name '*.c' -or -name '*.h' \) -not -name 'munit.*' -path ./llvm -prune | xargs ./llvm/bin/clang-format --style=file --dry-run -Werror + - name: Run clang-tidy + run: | + shopt -s globstar + bear -- cc -D_GNU_SOURCE -DHAVE_LINUX_AIO_ABI_H -c {src,test}/**/*.c + git show -U0 --first-parent | ./clang-tidy-diff.py -p1 -config-file=.clang-tidy -clang-tidy-binary=./llvm/bin/clang-tidy -use-color diff --git a/Makefile.am b/Makefile.am index 8c8cb2581..ee5751e0d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,18 +1,33 @@ ACLOCAL_AMFLAGS = -I m4 AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) -AM_CFLAGS += $(SQLITE_CFLAGS) $(UV_CFLAGS) $(RAFT_CFLAGS) $(PTHREAD_CFLAGS) -AM_LDFLAGS = $(UV_LIBS) $(RAFT_LIBS) $(PTHREAD_LIBS) +AM_CFLAGS += $(SQLITE_CFLAGS) $(UV_CFLAGS) $(PTHREAD_CFLAGS) +AM_LDFLAGS = $(UV_LIBS) $(PTHREAD_LIBS) if !BUILD_SQLITE_ENABLED AM_LDFLAGS += $(SQLITE_LIBS) endif +if !BUILD_RAFT_ENABLED +AM_CFLAGS += $(RAFT_CFLAGS) +AM_LDFLAGS += $(RAFT_LIBS) +endif + +if DEBUG_ENABLED + AM_CFLAGS += -g3 +else + AM_CFLAGS += -O2 +endif +if SANITIZE_ENABLED + AM_CFLAGS += -fsanitize=address +endif +if BACKTRACE_ENABLED + AM_CFLAGS += -DDQLITE_ASSERT_WITH_BACKTRACE -DRAFT_ASSERT_WITH_BACKTRACE + AM_LDFLAGS += -lbacktrace +endif + include_HEADERS = include/dqlite.h -lib_LTLIBRARIES = libdqlite.la -libdqlite_la_CFLAGS = $(AM_CFLAGS) -fvisibility=hidden -libdqlite_la_LDFLAGS = $(AM_LDFLAGS) -version-info 0:1:0 -libdqlite_la_SOURCES = \ +basic_dqlite_sources = \ src/bind.c \ src/client/protocol.c \ src/command.c \ @@ -46,14 +61,72 @@ libdqlite_la_SOURCES = \ src/tuple.c \ src/vfs.c +lib_LTLIBRARIES = libdqlite.la +libdqlite_la_CFLAGS = $(AM_CFLAGS) -fvisibility=hidden -DRAFT_API='' +libdqlite_la_LDFLAGS = $(AM_LDFLAGS) -version-info 0:1:0 +libdqlite_la_SOURCES = $(basic_dqlite_sources) + if BUILD_SQLITE_ENABLED libdqlite_la_SOURCES += sqlite3.c endif -check_PROGRAMS = \ - unit-test \ - integration-test -TESTS = unit-test integration-test +if BUILD_RAFT_ENABLED +libraft_la_SOURCES = \ + src/raft/byte.c \ + src/raft/callbacks.c \ + src/raft/client.c \ + src/raft/compress.c \ + src/raft/configuration.c \ + src/raft/convert.c \ + src/raft/election.c \ + src/raft/entry.c \ + src/raft/err.c \ + src/raft/fixture.c \ + src/raft/flags.c \ + src/raft/heap.c \ + src/raft/lifecycle.c \ + src/raft/log.c \ + src/raft/membership.c \ + src/raft/progress.c \ + src/raft/raft.c \ + src/raft/recv.c \ + src/raft/recv_append_entries.c \ + src/raft/recv_append_entries_result.c \ + src/raft/recv_request_vote.c \ + src/raft/recv_request_vote_result.c \ + src/raft/recv_install_snapshot.c 
\ + src/raft/recv_timeout_now.c \ + src/raft/replication.c \ + src/raft/snapshot.c \ + src/raft/start.c \ + src/raft/state.c \ + src/raft/syscall.c \ + src/raft/tick.c \ + src/raft/uv.c \ + src/raft/uv_append.c \ + src/raft/uv_encoding.c \ + src/raft/uv_finalize.c \ + src/raft/uv_fs.c \ + src/raft/uv_ip.c \ + src/raft/uv_list.c \ + src/raft/uv_metadata.c \ + src/raft/uv_os.c \ + src/raft/uv_prepare.c \ + src/raft/uv_recv.c \ + src/raft/uv_segment.c \ + src/raft/uv_send.c \ + src/raft/uv_snapshot.c \ + src/raft/uv_tcp.c \ + src/raft/uv_tcp_listen.c \ + src/raft/uv_tcp_connect.c \ + src/raft/uv_truncate.c \ + src/raft/uv_work.c \ + src/raft/uv_writer.c + +libdqlite_la_SOURCES += $(libraft_la_SOURCES) +endif # BUILD_RAFT_ENABLED + +check_PROGRAMS = unit-test integration-test check_LTLIBRARIES = libtest.la @@ -70,7 +143,7 @@ libtest_la_SOURCES = \ test/lib/sqlite.c \ test/lib/uv.c -unit_test_SOURCES = $(libdqlite_la_SOURCES) +unit_test_SOURCES = $(basic_dqlite_sources) unit_test_SOURCES += \ test/test_error.c \ test/test_integration.c \ @@ -96,6 +169,10 @@ unit_test_CFLAGS = $(AM_CFLAGS) -Wno-unknown-warning-option -Wno-uninitialized - unit_test_LDFLAGS = $(AM_LDFLAGS) unit_test_LDADD = libtest.la +if BUILD_RAFT_ENABLED +unit_test_LDADD += libraft.la +endif + integration_test_SOURCES = \ test/integration/test_client.c \ test/integration/test_cluster.c \ @@ -110,19 +187,148 @@ integration_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion integration_test_LDFLAGS = $(AM_LDFLAGS) -no-install integration_test_LDADD = libtest.la libdqlite.la -if DEBUG_ENABLED - AM_CFLAGS += -g3 -else - AM_CFLAGS += -O2 -endif -if SANITIZE_ENABLED - AM_CFLAGS += -fsanitize=address +if BUILD_RAFT_ENABLED +check_LTLIBRARIES += libraft.la + +check_PROGRAMS += \ + raft-core-unit-test \ + raft-core-integration-test \ + raft-uv-unit-test \ + raft-uv-integration-test \ + raft-core-fuzzy-test + +libtest_la_SOURCES += \ + test/raft/lib/addrinfo.c \ + test/raft/lib/fault.c \ + test/raft/lib/fsm.c \ + test/raft/lib/heap.c \ + test/raft/lib/munit.c \ + test/raft/lib/tcp.c \ + test/raft/lib/cluster.c \ + test/raft/lib/aio.c \ + test/raft/lib/dir.c \ + test/raft/lib/tcp.c \ + test/raft/lib/loop.c + +libraft_la_CFLAGS = $(AM_CFLAGS) +libraft_la_LDFLAGS = $(UV_LIBS) + +raft_core_unit_test_SOURCES = \ + src/tracing.c \ + src/raft/byte.c \ + src/raft/compress.c \ + src/raft/configuration.c \ + src/raft/err.c \ + src/raft/flags.c \ + src/raft/heap.c \ + src/raft/log.c \ + test/raft/unit/main_core.c \ + test/raft/unit/test_byte.c \ + test/raft/unit/test_compress.c \ + test/raft/unit/test_configuration.c \ + test/raft/unit/test_err.c \ + test/raft/unit/test_flags.c \ + test/raft/unit/test_log.c \ + test/raft/unit/test_queue.c +raft_core_unit_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion +raft_core_unit_test_LDADD = libtest.la + +raft_core_integration_test_SOURCES = \ + src/tracing.c \ + test/raft/integration/main_core.c \ + test/raft/integration/test_apply.c \ + test/raft/integration/test_assign.c \ + test/raft/integration/test_barrier.c \ + test/raft/integration/test_bootstrap.c \ + test/raft/integration/test_digest.c \ + test/raft/integration/test_election.c \ + test/raft/integration/test_fixture.c \ + test/raft/integration/test_heap.c \ + test/raft/integration/test_init.c \ + test/raft/integration/test_membership.c \ + test/raft/integration/test_recover.c \ + test/raft/integration/test_replication.c \ + test/raft/integration/test_snapshot.c \ + test/raft/integration/test_start.c \ + test/raft/integration/test_strerror.c \ + 
test/raft/integration/test_tick.c \ + test/raft/integration/test_transfer.c \ + test/raft/integration/test_voter_contacts.c +raft_core_integration_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion +raft_core_integration_test_LDFLAGS = -no-install +raft_core_integration_test_LDADD = libtest.la libraft.la + +raft_core_fuzzy_test_SOURCES = \ + src/tracing.c \ + test/raft/fuzzy/main_core.c \ + test/raft/fuzzy/test_election.c \ + test/raft/fuzzy/test_liveness.c \ + test/raft/fuzzy/test_membership.c \ + test/raft/fuzzy/test_replication.c +raft_core_fuzzy_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion +raft_core_fuzzy_test_LDFLAGS = -no-install +raft_core_fuzzy_test_LDADD = libtest.la libraft.la + +raft_uv_unit_test_SOURCES = \ + src/raft/err.c \ + src/raft/heap.c \ + src/raft/syscall.c \ + src/raft/uv_fs.c \ + src/raft/uv_os.c \ + src/raft/uv_writer.c \ + test/raft/unit/main_uv.c \ + test/raft/unit/test_uv_fs.c \ + test/raft/unit/test_uv_os.c \ + test/raft/unit/test_uv_writer.c +raft_uv_unit_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion +raft_uv_unit_test_LDADD = libtest.la $(UV_LIBS) + +# The integration/uv test is not linked to libraft, but built +# directly against the libraft sources in order to test some +# non-visible, non-API functions. +raft_uv_integration_test_SOURCES = \ + $(libraft_la_SOURCES) \ + src/tracing.c \ + test/raft/integration/main_uv.c \ + test/raft/integration/test_uv_init.c \ + test/raft/integration/test_uv_append.c \ + test/raft/integration/test_uv_bootstrap.c \ + test/raft/integration/test_uv_load.c \ + test/raft/integration/test_uv_recover.c \ + test/raft/integration/test_uv_recv.c \ + test/raft/integration/test_uv_send.c \ + test/raft/integration/test_uv_set_term.c \ + test/raft/integration/test_uv_tcp_connect.c \ + test/raft/integration/test_uv_tcp_listen.c \ + test/raft/integration/test_uv_snapshot_put.c \ + test/raft/integration/test_uv_truncate.c \ + test/raft/integration/test_uv_truncate_snapshot.c \ + test/raft/integration/test_uv_work.c +raft_uv_integration_test_CFLAGS = $(AM_CFLAGS) -Wno-type-limits -Wno-conversion +raft_uv_integration_test_LDFLAGS = -no-install +raft_uv_integration_test_LDADD = libtest.la $(UV_LIBS) + +if LZ4_AVAILABLE +libdqlite_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS) +libdqlite_la_LDFLAGS += $(LZ4_LIBS) +raft_core_unit_test_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS) +raft_core_unit_test_LDFLAGS = $(LZ4_LIBS) +libraft_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS) +libraft_la_LDFLAGS += $(LZ4_LIBS) +raft_uv_integration_test_CFLAGS += -DLZ4_AVAILABLE +raft_uv_integration_test_LDFLAGS += $(LZ4_LIBS) endif -if BACKTRACE_ENABLED - AM_CFLAGS += -DDQLITE_ASSERT_WITH_BACKTRACE - AM_LDFLAGS += -lbacktrace +if LZ4_ENABLED +libdqlite_la_CFLAGS += -DLZ4_ENABLED +raft_uv_integration_test_CFLAGS += -DLZ4_ENABLED +raft_core_unit_test_CFLAGS += -DLZ4_ENABLED +libraft_la_CFLAGS += -DLZ4_ENABLED endif +endif # BUILD_RAFT_ENABLED + +TESTS = $(check_PROGRAMS) + if CODE_COVERAGE_ENABLED include $(top_srcdir)/aminclude_static.am diff --git a/README.md b/README.md index 52e68abcb..5a26f5185 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,13 @@ dqlite [![CI Tests](https://github.com/canonical/dqlite/actions/workflows/build- [English](./README.md)|[简体中文](./README_CH.md) -[dqlite](https://dqlite.io) is a C library that implements an embeddable and replicated SQL database -engine with high availability and automatic failover. +[dqlite](https://dqlite.io) is a C library that implements an embeddable and +replicated SQL database engine with high availability and automatic failover. 
-The acronym "dqlite" stands for "distributed SQLite", meaning that dqlite extends -[SQLite](https://sqlite.org/) with a network protocol that can connect together -various instances of your application and have them act as a highly-available -cluster, with no dependency on external databases. +The acronym "dqlite" stands for "distributed SQLite", meaning that dqlite +extends [SQLite](https://sqlite.org/) with a network protocol that can connect +together various instances of your application and have them act as a +highly-available cluster, with no dependency on external databases. Design highlights ---------------- @@ -17,24 +17,23 @@ Design highlights * Asynchronous single-threaded implementation using [libuv](https://libuv.org/) as event loop. * Custom wire protocol optimized for SQLite primitives and data types. -* Data replication based on the [Raft](https://raft.github.io/) algorithm and its - efficient [C-raft](https://github.com/canonical/raft) implementation. +* Data replication based on the [Raft](https://raft.github.io/) algorithm. License ------- -The dqlite library is released under a slightly modified version of LGPLv3, that -includes a copyright exception allowing users to statically link the library code -in their project and release the final work under their own terms. See the full -[license](https://github.com/canonical/dqlite/blob/master/LICENSE) text. +The dqlite library is released under a slightly modified version of LGPLv3, +that includes a copyright exception allowing users to statically link the +library code in their project and release the final work under their own terms. +See the full [license](https://github.com/canonical/dqlite/blob/master/LICENSE) +text. Compatibility ------------- dqlite runs on Linux and requires a kernel with support for [native async I/O](https://man7.org/linux/man-pages/man2/io_setup.2.html) (not to be confused -with [POSIX AIO](https://man7.org/linux/man-pages/man7/aio.7.html)), which is -used by the libuv backend of C-raft. +with [POSIX AIO](https://man7.org/linux/man-pages/man7/aio.7.html)). Try it ------- @@ -49,24 +48,26 @@ Media A talk about dqlite was given at FOSDEM 2020, you can watch it [here](https://fosdem.org/2020/schedule/event/dqlite/). -[Here](https://gcore.com/blog/comparing-litestream-rqlite-dqlite/) is a blog post from 2022 comparing dqlite with rqlite and Litestream, other replication software for SQLite. +[Here](https://gcore.com/blog/comparing-litestream-rqlite-dqlite/) is a blog +post from 2022 comparing dqlite with rqlite and Litestream, other replication +software for SQLite. Wire protocol ------------- -If you wish to write a client, please refer to the [wire protocol](https://dqlite.io/docs/protocol) -documentation. +If you wish to write a client, please refer to the [wire +protocol](https://dqlite.io/docs/protocol) documentation. Install ------- -If you are on a Debian-based system, you can get the latest development release from -dqlite's [dev PPA](https://launchpad.net/~dqlite/+archive/ubuntu/dev): +If you are on a Debian-based system, you can get the latest development release +from dqlite's [dev PPA](https://launchpad.net/~dqlite/+archive/ubuntu/dev): ``` sudo add-apt-repository ppa:dqlite/dev -sudo apt-get update -sudo apt-get install libdqlite-dev +sudo apt update +sudo apt install libdqlite-dev ``` Build @@ -74,45 +75,50 @@ Build To build libdqlite from source you'll need: -* A reasonably recent version of [libuv](http://libuv.org/) (v1.8.0 or beyond). 
-* A reasonably recent version of sqlite3-dev
-* A build of the [C-raft](https://github.com/canonical/raft) Raft library.
+* Build dependencies: pkg-config and GNU Autoconf, Automake, libtool, and make
+* A reasonably recent version of [libuv](https://libuv.org/) (v1.8.0 or later), with headers.
+* A reasonably recent version of [SQLite](https://sqlite.org/) (v3.22.0 or later), with headers.
+* Optionally, a reasonably recent version of [LZ4](https://lz4.org/) (v1.7.1 or later), with headers.
 
-Your distribution should already provide you with a pre-built libuv shared
-library and libsqlite3-dev.
+Your distribution should already provide you with these dependencies. For
+example, on Debian-based distros:
 
-For the Debian-based Linux distros you can install the build dependencies with:
+```
+sudo apt install pkg-config autoconf automake libtool make libuv1-dev libsqlite3-dev liblz4-dev
+```
+
+With these dependencies installed, you can build and install the dqlite shared
+library and headers as follows:
 
 ```
-sudo apt install autoconf libuv1-dev liblz4-dev libtool pkg-config build-essential libsqlite3-dev
+$ autoreconf -i
+$ ./configure --enable-build-raft
+$ make
+$ sudo make install
 ```
 
-To build the raft library:
+The default installation prefix is `/usr/local`; you may need to run
 
 ```
-git clone https://github.com/canonical/raft.git
-cd raft
-autoreconf -i
-./configure
-make
-sudo make install
-cd ..
+$ sudo ldconfig
 ```
 
-Once all the required libraries are installed, in order to build the dqlite
-shared library itself, you can run:
+to enable the linker to find `libdqlite.so`. To install to a different prefix,
+replace the configure step with something like
 
 ```
-autoreconf -i
-./configure
-make
-sudo make install
+$ ./configure --enable-build-raft --prefix=/usr
 ```
 
+The `--enable-build-raft` option causes dqlite to use its bundled Raft
+implementation instead of linking to an external libraft; the latter is a
+legacy configuration that should not be used for new development.
+
 Usage Notes
 -----------
 
-Detailed tracing will be enabled when the environment variable `LIBDQLITE_TRACE` is set before startup.
-The value of it can be in `[0..5]` range and reperesents a tracing level, where
-`0` means "no traces" emitted, `5` enables minimum (FATAL records only), and `1`
-enables maximum verbosity (all: DEBUG, INFO, WARN, ERROR, FATAL records).
+Detailed tracing will be enabled when the environment variable
+`LIBDQLITE_TRACE` is set before startup. Its value can be in the `[0..5]`
+range and represents a tracing level, where `0` means no traces are emitted,
+`5` enables minimum verbosity (FATAL records only), and `1` enables maximum
+verbosity (all records: DEBUG, INFO, WARN, ERROR, FATAL).
diff --git a/clang-tidy-diff.py b/clang-tidy-diff.py
new file mode 100755
index 000000000..d96b3450f
--- /dev/null
+++ b/clang-tidy-diff.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+#
+# ===- clang-tidy-diff.py - ClangTidy Diff Checker -----------*- python -*--===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===-----------------------------------------------------------------------===#
+
+r"""
+ClangTidy Diff Checker
+======================
+
+This script reads input from a unified diff, runs clang-tidy on all changed
+files and outputs clang-tidy warnings in changed lines only.
This is useful to +detect clang-tidy regressions in the lines touched by a specific patch. +Example usage for git/svn users: + + git diff -U0 HEAD^ | clang-tidy-diff.py -p1 + svn diff --diff-cmd=diff -x-U0 | \ + clang-tidy-diff.py -fix -checks=-*,modernize-use-override + +""" + +import argparse +import glob +import json +import multiprocessing +import os +import re +import shutil +import subprocess +import sys +import tempfile +import threading +import traceback + +try: + import yaml +except ImportError: + yaml = None + +is_py2 = sys.version[0] == "2" + +if is_py2: + import Queue as queue +else: + import queue as queue + + +def run_tidy(task_queue, lock, timeout, failed_files): + watchdog = None + while True: + command = task_queue.get() + try: + proc = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + if timeout is not None: + watchdog = threading.Timer(timeout, proc.kill) + watchdog.start() + + stdout, stderr = proc.communicate() + if proc.returncode != 0: + if proc.returncode < 0: + msg = "Terminated by signal %d : %s\n" % ( + -proc.returncode, + " ".join(command), + ) + stderr += msg.encode("utf-8") + failed_files.append(command) + + with lock: + sys.stdout.write(stdout.decode("utf-8") + "\n") + sys.stdout.flush() + if stderr: + sys.stderr.write(stderr.decode("utf-8") + "\n") + sys.stderr.flush() + except Exception as e: + with lock: + sys.stderr.write("Failed: " + str(e) + ": ".join(command) + "\n") + finally: + with lock: + if not (timeout is None or watchdog is None): + if not watchdog.is_alive(): + sys.stderr.write( + "Terminated by timeout: " + " ".join(command) + "\n" + ) + watchdog.cancel() + task_queue.task_done() + + +def start_workers(max_tasks, tidy_caller, arguments): + for _ in range(max_tasks): + t = threading.Thread(target=tidy_caller, args=arguments) + t.daemon = True + t.start() + + +def merge_replacement_files(tmpdir, mergefile): + """Merge all replacement files in a directory into a single file""" + # The fixes suggested by clang-tidy >= 4.0.0 are given under + # the top level key 'Diagnostics' in the output yaml files + mergekey = "Diagnostics" + merged = [] + for replacefile in glob.iglob(os.path.join(tmpdir, "*.yaml")): + content = yaml.safe_load(open(replacefile, "r")) + if not content: + continue # Skip empty files. + merged.extend(content.get(mergekey, [])) + + if merged: + # MainSourceFile: The key is required by the definition inside + # include/clang/Tooling/ReplacementsYaml.h, but the value + # is actually never used inside clang-apply-replacements, + # so we set it to '' here. + output = {"MainSourceFile": "", mergekey: merged} + with open(mergefile, "w") as out: + yaml.safe_dump(output, out) + else: + # Empty the file: + open(mergefile, "w").close() + + +def main(): + parser = argparse.ArgumentParser( + description="Run clang-tidy against changed files, and " + "output diagnostics only for modified " + "lines." 
+ ) + parser.add_argument( + "-clang-tidy-binary", + metavar="PATH", + default="clang-tidy", + help="path to clang-tidy binary", + ) + parser.add_argument( + "-p", + metavar="NUM", + default=0, + help="strip the smallest prefix containing P slashes", + ) + parser.add_argument( + "-regex", + metavar="PATTERN", + default=None, + help="custom pattern selecting file paths to check " + "(case sensitive, overrides -iregex)", + ) + parser.add_argument( + "-iregex", + metavar="PATTERN", + default=r".*\.(cpp|cc|c\+\+|cxx|c|cl|h|hpp|m|mm|inc)", + help="custom pattern selecting file paths to check " + "(case insensitive, overridden by -regex)", + ) + parser.add_argument( + "-j", + type=int, + default=1, + help="number of tidy instances to be run in parallel.", + ) + parser.add_argument( + "-timeout", type=int, default=None, help="timeout per each file in seconds." + ) + parser.add_argument( + "-fix", action="store_true", default=False, help="apply suggested fixes" + ) + parser.add_argument( + "-checks", + help="checks filter, when not specified, use clang-tidy " "default", + default="", + ) + parser.add_argument( + "-config-file", + dest="config_file", + help="Specify the path of .clang-tidy or custom config file", + default="", + ) + parser.add_argument("-use-color", action="store_true", help="Use colors in output") + parser.add_argument( + "-path", dest="build_path", help="Path used to read a compile command database." + ) + if yaml: + parser.add_argument( + "-export-fixes", + metavar="FILE_OR_DIRECTORY", + dest="export_fixes", + help="A directory or a yaml file to store suggested fixes in, " + "which can be applied with clang-apply-replacements. If the " + "parameter is a directory, the fixes of each compilation unit are " + "stored in individual yaml files in the directory.", + ) + else: + parser.add_argument( + "-export-fixes", + metavar="DIRECTORY", + dest="export_fixes", + help="A directory to store suggested fixes in, which can be applied " + "with clang-apply-replacements. The fixes of each compilation unit are " + "stored in individual yaml files in the directory.", + ) + parser.add_argument( + "-extra-arg", + dest="extra_arg", + action="append", + default=[], + help="Additional argument to append to the compiler " "command line.", + ) + parser.add_argument( + "-extra-arg-before", + dest="extra_arg_before", + action="append", + default=[], + help="Additional argument to prepend to the compiler " "command line.", + ) + parser.add_argument( + "-quiet", + action="store_true", + default=False, + help="Run clang-tidy in quiet mode", + ) + parser.add_argument( + "-load", + dest="plugins", + action="append", + default=[], + help="Load the specified plugin in clang-tidy.", + ) + + clang_tidy_args = [] + argv = sys.argv[1:] + if "--" in argv: + clang_tidy_args.extend(argv[argv.index("--") :]) + argv = argv[: argv.index("--")] + + args = parser.parse_args(argv) + + # Extract changed lines for each file. 
+ filename = None + lines_by_file = {} + for line in sys.stdin: + match = re.search('^\+\+\+\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line) + if match: + filename = match.group(2) + if filename is None: + continue + + if args.regex is not None: + if not re.match("^%s$" % args.regex, filename): + continue + else: + if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE): + continue + + match = re.search("^@@.*\+(\d+)(,(\d+))?", line) + if match: + start_line = int(match.group(1)) + line_count = 1 + if match.group(3): + line_count = int(match.group(3)) + if line_count == 0: + continue + end_line = start_line + line_count - 1 + lines_by_file.setdefault(filename, []).append([start_line, end_line]) + + if not any(lines_by_file): + print("No relevant changes found.") + sys.exit(0) + + max_task_count = args.j + if max_task_count == 0: + max_task_count = multiprocessing.cpu_count() + max_task_count = min(len(lines_by_file), max_task_count) + + combine_fixes = False + export_fixes_dir = None + delete_fixes_dir = False + if args.export_fixes is not None: + # if a directory is given, create it if it does not exist + if args.export_fixes.endswith(os.path.sep) and not os.path.isdir( + args.export_fixes + ): + os.makedirs(args.export_fixes) + + if not os.path.isdir(args.export_fixes): + if not yaml: + raise RuntimeError( + "Cannot combine fixes in one yaml file. Either install PyYAML or specify an output directory." + ) + + combine_fixes = True + + if os.path.isdir(args.export_fixes): + export_fixes_dir = args.export_fixes + + if combine_fixes: + export_fixes_dir = tempfile.mkdtemp() + delete_fixes_dir = True + + # Tasks for clang-tidy. + task_queue = queue.Queue(max_task_count) + # A lock for console output. + lock = threading.Lock() + + # List of files with a non-zero return code. + failed_files = [] + + # Run a pool of clang-tidy workers. + start_workers( + max_task_count, run_tidy, (task_queue, lock, args.timeout, failed_files) + ) + + # Form the common args list. + common_clang_tidy_args = [] + if args.fix: + common_clang_tidy_args.append("-fix") + if args.checks != "": + common_clang_tidy_args.append("-checks=" + args.checks) + if args.config_file != "": + common_clang_tidy_args.append("-config-file=" + args.config_file) + if args.quiet: + common_clang_tidy_args.append("-quiet") + if args.build_path is not None: + common_clang_tidy_args.append("-p=%s" % args.build_path) + if args.use_color: + common_clang_tidy_args.append("--use-color") + for arg in args.extra_arg: + common_clang_tidy_args.append("-extra-arg=%s" % arg) + for arg in args.extra_arg_before: + common_clang_tidy_args.append("-extra-arg-before=%s" % arg) + for plugin in args.plugins: + common_clang_tidy_args.append("-load=%s" % plugin) + + for name in lines_by_file: + line_filter_json = json.dumps( + [{"name": name, "lines": lines_by_file[name]}], separators=(",", ":") + ) + + # Run clang-tidy on files containing changes. + command = [args.clang_tidy_binary] + command.append("-line-filter=" + line_filter_json) + if args.export_fixes is not None: + # Get a temporary file. We immediately close the handle so clang-tidy can + # overwrite it. + (handle, tmp_name) = tempfile.mkstemp(suffix=".yaml", dir=export_fixes_dir) + os.close(handle) + command.append("-export-fixes=" + tmp_name) + command.extend(common_clang_tidy_args) + command.append(name) + command.extend(clang_tidy_args) + + task_queue.put(command) + + # Application return code + return_code = 0 + + # Wait for all threads to be done. 
+    task_queue.join()
+    if failed_files:
+        return_code = 1
+
+    if combine_fixes:
+        print("Writing fixes to " + args.export_fixes + " ...")
+        try:
+            merge_replacement_files(export_fixes_dir, args.export_fixes)
+        except:
+            sys.stderr.write("Error exporting fixes.\n")
+            traceback.print_exc()
+            return_code = 1
+
+    if delete_fixes_dir:
+        shutil.rmtree(export_fixes_dir)
+    sys.exit(return_code)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/configure.ac b/configure.ac
index 19cbdf701..99be7875a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -6,7 +6,7 @@ AC_CONFIG_AUX_DIR([ac])
 AM_INIT_AUTOMAKE([subdir-objects -Wall -Werror -Wno-portability foreign])
 AM_SILENT_RULES([yes])
 
-AC_PROG_CC_STDC
+AC_PROG_CC
 AC_USE_SYSTEM_EXTENSIONS
 AX_PTHREAD
 
@@ -30,21 +30,24 @@ AM_COND_IF(SANITIZE_ENABLED,
 AC_ARG_ENABLE(backtrace, AS_HELP_STRING([--enable-backtrace[=ARG]], [print backtrace on assertion failure [default=no]]))
 AM_CONDITIONAL(BACKTRACE_ENABLED, test "x$enable_backtrace" = "xyes")
 
+
 AC_ARG_ENABLE(build-sqlite, AS_HELP_STRING([--enable-build-sqlite[=ARG]], [build libsqlite3 from sqlite3.c in the build root [default=no]]))
 AM_CONDITIONAL(BUILD_SQLITE_ENABLED, test "x$enable_build_sqlite" = "xyes")
 
+AC_ARG_ENABLE(build-raft, AS_HELP_STRING([--enable-build-raft[=ARG]], [use the bundled raft sources instead of linking to libraft [default=no]]))
+AM_CONDITIONAL(BUILD_RAFT_ENABLED, test "x$enable_build_raft" = "xyes")
+
+# Allow not linking to liblz4 even if it's present.
+AC_ARG_WITH([lz4], AS_HELP_STRING([--without-lz4], [never link to liblz4]))
+
 # Whether to enable code coverage.
 AX_CODE_COVERAGE
 
 # Checks for header files.
-AC_CHECK_HEADERS([arpa/inet.h fcntl.h stdint.h stdlib.h string.h sys/socket.h unistd.h])
+AC_CHECK_HEADERS([linux/io_uring.h linux/aio_abi.h])
 
-# Checks for typedefs, structures, and compiler characteristics.
-AC_TYPE_SIZE_T
-AC_TYPE_SSIZE_T
-AC_TYPE_UINT16_T
-AC_TYPE_UINT32_T
-AC_TYPE_UINT8_T
+# Checks for library functions and definitions.
+AC_CHECK_DECLS(RWF_NOWAIT, [], [AC_MSG_ERROR(Linux kernel >= 4.14 required.)], [#include <linux/aio_abi.h>])
 
 # Enable large file support.
This is mandatory in order to interoperate with # libuv, which enables large file support by default, making the size of 'off_t' @@ -54,7 +57,17 @@ AC_SYS_LARGEFILE # Checks for libraries PKG_CHECK_MODULES(SQLITE, [sqlite3 >= 3.22.0], [], []) PKG_CHECK_MODULES(UV, [libuv >= 1.8.0], [], []) -PKG_CHECK_MODULES(RAFT, [raft >= 0.18.1], [], []) +AS_IF([test "x$enable_build_raft" != "xyes"], [PKG_CHECK_MODULES(RAFT, [raft >= 0.18.1], [], [])], []) + +AS_IF([test "x$with_lz4" != "xno"], [PKG_CHECK_MODULES(LZ4, [liblz4 >= 1.7.1], [have_lz4=yes], [have_lz4=no])], [have_lz4=no]) +AS_IF([test "x$with_lz4" != "xno" -a "x$have_lz4" = "xno"], [AC_MSG_ERROR([liblz4 required but not found])], []) +AM_CONDITIONAL(LZ4_AVAILABLE, test "x$have_lz4" = "xyes") + +AC_ARG_ENABLE(lz4, AS_HELP_STRING([--disable-lz4], [when building with lz4, do not compress snapshots by default])) +AS_IF([test "x$enable_lz4" != "x" -a "x$have_lz4" = "xno"], + [AC_MSG_ERROR([snapshot compression (either by default or not) requires liblz4])], + []) +AM_CONDITIONAL(LZ4_ENABLED, test "x$enable_lz4" != "xno" -a "x$have_lz4" = "xyes") CC_CHECK_FLAGS_APPEND([AM_CFLAGS],[CFLAGS],[ \ -std=c11 \ diff --git a/include/dqlite.h b/include/dqlite.h index ad57bd208..0b24b399f 100644 --- a/include/dqlite.h +++ b/include/dqlite.h @@ -19,8 +19,9 @@ */ #define DQLITE_EXPERIMENTAL -/* XXX */ +#ifndef DQLITE_VISIBLE_TO_TESTS #define DQLITE_VISIBLE_TO_TESTS DQLITE_API +#endif /** * Version. diff --git a/m4/ax_pthread.m4 b/m4/ax_pthread.m4 index 1598d077f..9f35d1391 100644 --- a/m4/ax_pthread.m4 +++ b/m4/ax_pthread.m4 @@ -14,20 +14,24 @@ # flags that are needed. (The user can also force certain compiler # flags/libs to be tested by setting these environment variables.) # -# Also sets PTHREAD_CC to any special C compiler that is needed for -# multi-threaded programs (defaults to the value of CC otherwise). (This -# is necessary on AIX to use the special cc_r compiler alias.) +# Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is +# needed for multi-threaded programs (defaults to the value of CC +# respectively CXX otherwise). (This is necessary on e.g. AIX to use the +# special cc_r/CC_r compiler alias.) # # NOTE: You are assumed to not only compile your program with these flags, # but also to link with them as well. For example, you might link with # $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS +# $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS # # If you are only building threaded programs, you may wish to use these # variables in your default LIBS, CFLAGS, and CC: # # LIBS="$PTHREAD_LIBS $LIBS" # CFLAGS="$CFLAGS $PTHREAD_CFLAGS" +# CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS" # CC="$PTHREAD_CC" +# CXX="$PTHREAD_CXX" # # In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant # has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to @@ -83,7 +87,7 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. 
-#serial 27
+#serial 31
 
 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
 AC_DEFUN([AX_PTHREAD], [
@@ -105,6 +109,7 @@ if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then
         ax_pthread_save_CFLAGS="$CFLAGS"
         ax_pthread_save_LIBS="$LIBS"
         AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"])
+        AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"])
         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
         LIBS="$PTHREAD_LIBS $LIBS"
         AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS])
@@ -386,7 +391,7 @@ if test "x$ax_pthread_clang" = "xyes"; then
     # step
     ax_pthread_save_ac_link="$ac_link"
     ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g'
-    ax_pthread_link_step=`$as_echo "$ac_link" | sed "$ax_pthread_sed"`
+    ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"`
     ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)"
     ax_pthread_save_CFLAGS="$CFLAGS"
     for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do
@@ -482,18 +487,28 @@ if test "x$ax_pthread_ok" = "xyes"; then
             [#handle absolute path differently from PATH based program lookup
              AS_CASE(["x$CC"],
                  [x/*],
-                 [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])],
-                 [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])])
+                 [
+                  AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])
+                  AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])])
+                 ],
+                 [
+                  AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])
+                  AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])])
+                 ]
+             )
+            ])
             ;;
     esac
 fi
 fi
 
 test -n "$PTHREAD_CC" || PTHREAD_CC="$CC"
+test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX"
 
 AC_SUBST([PTHREAD_LIBS])
 AC_SUBST([PTHREAD_CFLAGS])
 AC_SUBST([PTHREAD_CC])
+AC_SUBST([PTHREAD_CXX])
 
 # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
 if test "x$ax_pthread_ok" = "xyes"; then
diff --git a/src/command.h b/src/command.h
index f41adc6a5..4b911d778 100644
--- a/src/command.h
+++ b/src/command.h
@@ -5,11 +5,10 @@
 #ifndef COMMAND_H_
 #define COMMAND_H_
 
-#include <raft.h>
-
 #include "../include/dqlite.h"
 
 #include "lib/serialize.h"
+#include "raft.h"
 
 /* Command type codes */
 enum { COMMAND_OPEN = 1, COMMAND_FRAMES, COMMAND_UNDO, COMMAND_CHECKPOINT };
diff --git a/src/conn.h b/src/conn.h
index fb93c8f27..0ae2b1299 100644
--- a/src/conn.h
+++ b/src/conn.h
@@ -5,8 +5,6 @@
 #ifndef DQLITE_CONN_H_
 #define DQLITE_CONN_H_
 
-#include <raft.h>
-
 #include "lib/buffer.h"
 #include "lib/queue.h"
 #include "lib/transport.h"
@@ -14,6 +12,7 @@
 #include "gateway.h"
 #include "id.h"
 #include "message.h"
+#include "raft.h"
 
 /**
  * Callbacks.
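The include swap above repeats across the dqlite sources that follow: every use of the external `<raft.h>` becomes an include of the bundled `src/raft.h`. Combined with the `-DRAFT_API=''` and `-fvisibility=hidden` flags that Makefile.am passes when building `libdqlite.la`, this keeps the vendored raft symbols out of libdqlite's public ABI. A minimal sketch of the mechanism (illustration only, not part of the patch; `raft_strerror()` is one of the declarations actually affected):

```c
/* src/raft.h marks every public raft declaration with RAFT_API and falls
 * back to default (exported) visibility when the macro is not predefined: */
#ifndef RAFT_API
#define RAFT_API __attribute__((visibility("default")))
#endif

RAFT_API const char *raft_strerror(int errnum);

/* Built as a standalone libraft, the declaration expands to:
 *
 *   __attribute__((visibility("default"))) const char *raft_strerror(int);
 *
 * Built into libdqlite with -DRAFT_API='' and -fvisibility=hidden, it
 * expands to a plain declaration, so the symbol stays internal to
 * libdqlite.so and cannot clash with a system-installed libraft. */
```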
diff --git a/src/fsm.c b/src/fsm.c
index 2af791254..4daa86790 100644
--- a/src/fsm.c
+++ b/src/fsm.c
@@ -1,10 +1,9 @@
-#include <raft.h>
-
 #include "lib/assert.h"
 #include "lib/serialize.h"
 
 #include "command.h"
 #include "fsm.h"
+#include "raft.h"
 #include "tracing.h"
 #include "vfs.h"
diff --git a/src/fsm.h b/src/fsm.h
index 5c849c186..dcb5c828d 100644
--- a/src/fsm.h
+++ b/src/fsm.h
@@ -5,9 +5,8 @@
 #ifndef DQLITE_FSM_H_
 #define DQLITE_FSM_H_
 
-#include <raft.h>
-
 #include "config.h"
+#include "raft.h"
 #include "registry.h"
 
 /**
diff --git a/src/gateway.h b/src/gateway.h
index dd07fdd59..208d42fc8 100644
--- a/src/gateway.h
+++ b/src/gateway.h
@@ -5,8 +5,6 @@
 #ifndef DQLITE_GATEWAY_H_
 #define DQLITE_GATEWAY_H_
 
-#include <raft.h>
-
 #include "../include/dqlite.h"
 
 #include "lib/buffer.h"
@@ -15,6 +13,7 @@
 #include "config.h"
 #include "id.h"
 #include "leader.h"
+#include "raft.h"
 #include "registry.h"
 #include "stmt.h"
diff --git a/src/leader.h b/src/leader.h
index 38f245967..30c541a77 100644
--- a/src/leader.h
+++ b/src/leader.h
@@ -5,12 +5,12 @@
 #ifndef LEADER_H_
 #define LEADER_H_
 
-#include <raft.h>
 #include <sqlite3.h>
 #include <stdbool.h>
 
 #include "./lib/queue.h"
 #include "db.h"
+#include "raft.h"
 
 #define SQLITE_IOERR_NOT_LEADER (SQLITE_IOERR | (40 << 8))
 #define SQLITE_IOERR_LEADERSHIP_LOST (SQLITE_IOERR | (41 << 8))
diff --git a/src/lib/transport.c b/src/lib/transport.c
index f833266cb..8ea086215 100644
--- a/src/lib/transport.c
+++ b/src/lib/transport.c
@@ -1,4 +1,4 @@
-#include <raft.h>
+#include "../raft.h"
 
 #include "../../include/dqlite.h"
diff --git a/src/logger.h b/src/logger.h
index f4b38db97..a5cf4813e 100644
--- a/src/logger.h
+++ b/src/logger.h
@@ -1,7 +1,7 @@
 #ifndef LOGGER_H_
 #define LOGGER_H_
 
-#include <raft.h>
+#include "raft.h"
 
 #include "../include/dqlite.h"
diff --git a/src/raft.h b/src/raft.h
new file mode 100644
index 000000000..7f8496c58
--- /dev/null
+++ b/src/raft.h
@@ -0,0 +1,1953 @@
+#ifndef RAFT_H
+#define RAFT_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <uv.h>
+
+#ifndef RAFT_API
+#define RAFT_API __attribute__((visibility("default")))
+#endif
+
+#ifndef DQLITE_VISIBLE_TO_TESTS
+#define DQLITE_VISIBLE_TO_TESTS __attribute__((visibility("default")))
+#endif
+
+/**
+ * Version.
+ */
+#define RAFT_VERSION_MAJOR 0
+#define RAFT_VERSION_MINOR 18
+#define RAFT_VERSION_RELEASE 0
+#define RAFT_VERSION_NUMBER \
+	(RAFT_VERSION_MAJOR * 100 * 100 + RAFT_VERSION_MINOR * 100 + \
+	 RAFT_VERSION_RELEASE)
+
+int raft_version_number(void);
+
+/**
+ * Error codes.
+ */
+enum {
+	RAFT_NOMEM = 1,        /* Out of memory */
+	RAFT_BADID,            /* Server ID is not valid */
+	RAFT_DUPLICATEID,      /* Server ID already in use */
+	RAFT_DUPLICATEADDRESS, /* Server address already in use */
+	RAFT_BADROLE,          /* Server role is not valid */
+	RAFT_MALFORMED,
+	RAFT_NOTLEADER,
+	RAFT_LEADERSHIPLOST,
+	RAFT_SHUTDOWN,
+	RAFT_CANTBOOTSTRAP,
+	RAFT_CANTCHANGE,
+	RAFT_CORRUPT,
+	RAFT_CANCELED,
+	RAFT_NAMETOOLONG,
+	RAFT_TOOBIG,
+	RAFT_NOCONNECTION,
+	RAFT_BUSY,
+	RAFT_IOERR,        /* File system or storage error */
+	RAFT_NOTFOUND,     /* Resource not found */
+	RAFT_INVALID,      /* Invalid parameter */
+	RAFT_UNAUTHORIZED, /* No access to a resource */
+	RAFT_NOSPACE,      /* Not enough space on disk */
+	RAFT_TOOMANY       /* Some system or raft limit was hit */
+};
+
+/**
+ * Size of human-readable error message buffers.
+ */
+#define RAFT_ERRMSG_BUF_SIZE 256
+
+/**
+ * Return the error message describing the given error code.
+ */
+RAFT_API const char *raft_strerror(int errnum);
+
+typedef unsigned long long raft_id;
+
+/**
+ * Hold the value of a raft term. Guaranteed to be at least 64-bit long.
+ */ +typedef unsigned long long raft_term; + +/** + * Hold the value of a raft entry index. Guaranteed to be at least 64-bit long. + */ +typedef unsigned long long raft_index; + +/** + * Hold a time value expressed in milliseconds since the epoch. + */ +typedef unsigned long long raft_time; + +/** + * Hold the features a raft node is capable of. + */ +typedef uint64_t raft_flags; + +/** + * A data buffer. + */ +struct raft_buffer +{ + void *base; /* Pointer to the buffer data. */ + size_t len; /* Length of the buffer. */ +}; + +/** + * Server role codes. + */ +enum { + RAFT_STANDBY, /* Replicate log, does not participate in quorum. */ + RAFT_VOTER, /* Replicate log, does participate in quorum. */ + RAFT_SPARE /* Does not replicate log, or participate in quorum. */ +}; + +/** + * Hold information about a single server in the cluster configuration. + * WARNING: This struct is encoded/decoded, be careful when adapting it. + */ +struct raft_server +{ + raft_id id; /* Server ID, must be greater than zero. */ + char *address; /* Server address. User defined. */ + int role; /* Server role. */ +}; + +/** + * Hold information about all servers currently part of the cluster. + * WARNING: This struct is encoded/decoded, be careful when adapting it. + */ +struct raft_configuration +{ + struct raft_server + *servers; /* Array of servers member of the cluster. */ + unsigned n; /* Number of servers in the array. */ +}; + +/** + * Initialize an empty raft configuration. + */ +RAFT_API void raft_configuration_init(struct raft_configuration *c); + +/** + * Release all memory used by the given configuration object. + */ +RAFT_API void raft_configuration_close(struct raft_configuration *c); + +/** + * Add a server to a raft configuration. + * + * The @id must be greater than zero and @address point to a valid string. + * + * The @role must be either #RAFT_VOTER, #RAFT_STANDBY, #RAFT_SPARE. + * + * If @id or @address are already in use by another server in the configuration, + * an error is returned. + * + * The @address string will be copied and can be released after this function + * returns. + */ +RAFT_API int raft_configuration_add(struct raft_configuration *c, + raft_id id, + const char *address, + int role); + +/** + * Encode the given configuration object. + * + * The memory of the returned buffer is allocated using raft_malloc(), and + * client code is responsible for releasing it when no longer needed. + */ +RAFT_API int raft_configuration_encode(const struct raft_configuration *c, + struct raft_buffer *buf); + +/** + * Hash function which outputs a 64-bit value based on a text and a number. + * + * This can be used to generate a unique ID for a new server being added, for + * example based on its address and on the current time in milliseconds since + * the Epoch. + * + * It's internally implemented as a SHA1 where only the last 8 bytes of the hash + * value are kept. + */ +RAFT_API unsigned long long raft_digest(const char *text, unsigned long long n); + +/** + * Log entry types. + */ +enum { + RAFT_COMMAND = 1, /* Command for the application FSM. */ + RAFT_BARRIER, /* Wait for all previous commands to be applied. */ + RAFT_CHANGE /* Raft configuration change. */ +}; + +/** + * A single entry in the raft log. 
+ * + * An entry that originated from this raft instance while it was the leader + * (typically via client calls to raft_apply()) should normally have a @buf + * attribute referencing directly the memory that was originally allocated by + * the client itself to contain the entry data, and the @batch attribute set to + * #NULL. + * + * An entry that was received from the network as part of an AppendEntries RPC + * or that was loaded from disk at startup should normally have a @batch + * attribute that points to a contiguous chunk of memory that contains the data + * of the entry itself plus possibly the data for other entries that were + * received or loaded with it at the same time. In this case the @buf pointer + * will be equal to the @batch pointer plus an offset, that locates the position + * of the entry's data within the batch. + * + * When the @batch attribute is not #NULL the raft library will take care of + * releasing that memory only once there are no more references to the + * associated entries. + * + * This arrangement makes it possible to minimize the amount of memory-copying + * when performing I/O. + */ +struct raft_entry +{ + raft_term term; /* Term in which the entry was created. */ + unsigned short type; /* Type (FSM command, barrier, config change). */ + struct raft_buffer buf; /* Entry data. */ + void *batch; /* Batch that buf's memory points to, if any. */ +}; + +/** + * Hold the arguments of a RequestVote RPC. + * + * The RequestVote RPC is invoked by candidates to gather votes. + */ +struct raft_request_vote +{ + int version; + raft_term term; /* Candidate's term. */ + raft_id candidate_id; /* ID of the server requesting the vote. */ + raft_index last_log_index; /* Index of candidate's last log entry. */ + raft_index last_log_term; /* Term of log entry at last_log_index. */ + bool disrupt_leader; /* True if current leader should be discarded. */ + bool pre_vote; /* True if this is a pre-vote request. */ +}; +#define RAFT_REQUEST_VOTE_VERSION 2 + +/** + * Hold the result of a RequestVote RPC. + */ +struct raft_request_vote_result +{ + int version; + raft_term + term; /* Receiver's current term (candidate updates itself). */ + bool vote_granted; /* True means candidate received vote. */ + bool pre_vote; /* The response to a pre-vote RequestVote or not. */ +}; +#define RAFT_REQUEST_VOTE_RESULT_VERSION 2 + +/** + * Hold the arguments of an AppendEntries RPC. + * + * The AppendEntries RPC is invoked by the leader to replicate log entries. It's + * also used as heartbeat (figure 3.1). + */ +struct raft_append_entries +{ + int version; + raft_term term; /* Leader's term. */ + raft_index prev_log_index; /* Index of log entry preceeding new ones. */ + raft_term prev_log_term; /* Term of entry at prev_log_index. */ + raft_index leader_commit; /* Leader's commit index. */ + struct raft_entry *entries; /* Log entries to append. */ + unsigned n_entries; /* Size of the log entries array. */ +}; +#define RAFT_APPEND_ENTRIES_VERSION 0 + +/** + * Hold the result of an AppendEntries RPC (figure 3.1). + */ +struct raft_append_entries_result +{ + int version; + raft_term term; /* Receiver's current_term. */ + raft_index rejected; /* If non-zero, the index that was rejected. */ + raft_index + last_log_index; /* Receiver's last log entry index, as hint. */ + raft_flags features; /* Feature flags. */ +}; +#define RAFT_APPEND_ENTRIES_RESULT_VERSION 1 + +/** + * Hold the arguments of an InstallSnapshot RPC (figure 5.3). 
+ */ +struct raft_install_snapshot +{ + int version; + raft_term term; /* Leader's term. */ + raft_index last_index; /* Index of last entry in the snapshot. */ + raft_term last_term; /* Term of last_index. */ + struct raft_configuration conf; /* Config as of last_index. */ + raft_index conf_index; /* Commit index of conf. */ + struct raft_buffer data; /* Raw snapshot data. */ +}; +#define RAFT_INSTALL_SNAPSHOT_VERSION 0 + +/** + * Hold the arguments of a TimeoutNow RPC. + * + * The TimeoutNow RPC is invoked by leaders to transfer leadership to a + * follower. + */ +struct raft_timeout_now +{ + int version; + raft_term term; /* Leader's term. */ + raft_index last_log_index; /* Index of leader's last log entry. */ + raft_index last_log_term; /* Term of log entry at last_log_index. */ +}; +#define RAFT_TIMEOUT_NOW_VERSION 0 + +/** + * Type codes for RPC messages. + */ +enum { + RAFT_IO_APPEND_ENTRIES = 1, + RAFT_IO_APPEND_ENTRIES_RESULT, + RAFT_IO_REQUEST_VOTE, + RAFT_IO_REQUEST_VOTE_RESULT, + RAFT_IO_INSTALL_SNAPSHOT, + RAFT_IO_TIMEOUT_NOW +}; + +/** + * A single RPC message that can be sent or received over the network. + * + * The RPC message types all have a `version` field. + * In the libuv io implementation, `version` is filled out during decoding + * and is based on the size of the message on the wire, see e.g. + * `sizeofRequestVoteV1`. The version number in the RAFT_MESSAGE_XXX_VERSION + * macro needs to be bumped every time the message is updated. + * + * Notes when adding a new message type to raft: + * raft_io implementations compiled against old versions of raft don't know the + * new message type and possibly have not allocated enough space for it. When + * such an application receives a new message over the wire, the raft_io + * implementation will err out or drop the message, because it doesn't know how + * to decode it based on its type. + * raft_io implementations compiled against versions of raft that know the new + * message type but at runtime are linked against an older raft lib, will pass + * the message to raft, where raft will drop it. + * When raft receives a message and accesses a field of a new message type, + * the raft_io implementation must have known about the new message type, + * so it was compiled against a modern enough version of raft, and memory + * accesses should be safe. + * + * Sending a new message type with a raft_io implementation that doesn't know + * the type is safe, the implementation should drop the message based on its + * type and will not try to access fields it doesn't know the existence of. + */ +struct raft_message +{ + unsigned short type; /* RPC type code. */ + raft_id server_id; /* ID of sending or destination server. */ + const char + *server_address; /* Address of sending or destination server. */ + union { /* Type-specific data */ + struct raft_request_vote request_vote; + struct raft_request_vote_result request_vote_result; + struct raft_append_entries append_entries; + struct raft_append_entries_result append_entries_result; + struct raft_install_snapshot install_snapshot; + struct raft_timeout_now timeout_now; + }; +}; + +/** + * Hold the details of a snapshot. + * The user-provided raft_buffer structs should provide the user with enough + * flexibility to adapt/evolve snapshot formats. + * If this struct would NEED to be adapted in the future, raft can always move + * to a new struct with a new name and a new raft_io version. + */ +struct raft_snapshot +{ + /* Index and term of last entry included in the snapshot. 
*/ + raft_index index; + raft_term term; + + /* Last committed configuration included in the snapshot, along with the + * index it was committed at. */ + struct raft_configuration configuration; + raft_index configuration_index; + + /* Content of the snapshot. When a snapshot is taken, the user FSM can + * fill the bufs array with more than one buffer. When a snapshot is + * restored, there will always be a single buffer. */ + struct raft_buffer *bufs; + unsigned n_bufs; +}; + +/** + * Asynchronous request to send an RPC message. + */ +struct raft_io_send; +typedef void (*raft_io_send_cb)(struct raft_io_send *req, int status); +struct raft_io_send +{ + void *data; /* User data */ + raft_io_send_cb cb; /* Request callback */ +}; + +/** + * Asynchronous request to store new log entries. + */ +struct raft_io_append; +typedef void (*raft_io_append_cb)(struct raft_io_append *req, int status); +struct raft_io_append +{ + void *data; /* User data */ + raft_io_append_cb cb; /* Request callback */ +}; + +/** + * Asynchronous request to store a new snapshot. + */ +struct raft_io_snapshot_put; +typedef void (*raft_io_snapshot_put_cb)(struct raft_io_snapshot_put *req, + int status); +struct raft_io_snapshot_put +{ + void *data; /* User data */ + raft_io_snapshot_put_cb cb; /* Request callback */ +}; + +/** + * Asynchronous request to load the most recent snapshot available. + */ +struct raft_io_snapshot_get; +typedef void (*raft_io_snapshot_get_cb)(struct raft_io_snapshot_get *req, + struct raft_snapshot *snapshot, + int status); +struct raft_io_snapshot_get +{ + void *data; /* User data */ + raft_io_snapshot_get_cb cb; /* Request callback */ +}; + +/** + * Asynchronous work request. + */ +struct raft_io_async_work; +typedef int (*raft_io_async_work_fn)(struct raft_io_async_work *req); +typedef void (*raft_io_async_work_cb)(struct raft_io_async_work *req, + int status); +struct raft_io_async_work +{ + void *data; /* User data */ + raft_io_async_work_fn + work; /* Function to run async from the main loop */ + raft_io_async_work_cb cb; /* Request callback */ +}; + +/** + * Customizable tracer, for debugging purposes. + */ +struct raft_tracer +{ + /** + * Implementation-defined state object. + */ + void *impl; + + /** + * Whether this tracer should emit messages. + */ + bool enabled; + + /** + * Trace level. + */ + unsigned level; + + /** + * Emit the given trace message, possibly decorating it with the + * provided metadata. + */ + void (*emit)(struct raft_tracer *t, + const char *file, + unsigned int line, + const char *func, + unsigned int level, + const char *message); +}; + +struct raft_io; /* Forward declaration. */ + +/** + * Callback invoked by the I/O implementation at regular intervals. + */ +typedef void (*raft_io_tick_cb)(struct raft_io *io); + +/** + * Callback invoked by the I/O implementation when an RPC message is received. + */ +typedef void (*raft_io_recv_cb)(struct raft_io *io, struct raft_message *msg); + +typedef void (*raft_io_close_cb)(struct raft_io *io); + +/** + * version field MUST be filled out by user. + * When moving to a new version, the user MUST implement the newly added + * methods. 
+ */ +struct raft_io +{ + int version; /* 1 or 2 */ + void *data; + void *impl; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int (*init)(struct raft_io *io, raft_id id, const char *address); + void (*close)(struct raft_io *io, raft_io_close_cb cb); + int (*load)(struct raft_io *io, + raft_term *term, + raft_id *voted_for, + struct raft_snapshot **snapshot, + raft_index *start_index, + struct raft_entry *entries[], + size_t *n_entries); + int (*start)(struct raft_io *io, + unsigned msecs, + raft_io_tick_cb tick, + raft_io_recv_cb recv); + int (*bootstrap)(struct raft_io *io, + const struct raft_configuration *conf); + int (*recover)(struct raft_io *io, + const struct raft_configuration *conf); + int (*set_term)(struct raft_io *io, raft_term term); + int (*set_vote)(struct raft_io *io, raft_id server_id); + int (*send)(struct raft_io *io, + struct raft_io_send *req, + const struct raft_message *message, + raft_io_send_cb cb); + int (*append)(struct raft_io *io, + struct raft_io_append *req, + const struct raft_entry entries[], + unsigned n, + raft_io_append_cb cb); + int (*truncate)(struct raft_io *io, raft_index index); + int (*snapshot_put)(struct raft_io *io, + unsigned trailing, + struct raft_io_snapshot_put *req, + const struct raft_snapshot *snapshot, + raft_io_snapshot_put_cb cb); + int (*snapshot_get)(struct raft_io *io, + struct raft_io_snapshot_get *req, + raft_io_snapshot_get_cb cb); + raft_time (*time)(struct raft_io *io); + int (*random)(struct raft_io *io, int min, int max); + /* Field(s) below added since version 2. */ + int (*async_work)(struct raft_io *io, + struct raft_io_async_work *req, + raft_io_async_work_cb cb); +}; + +/** + * version field MUST be filled out by user. + * When moving to a new version, the user MUST initialize the new methods, + * either with an implementation or with NULL. + * + * version 2: + * introduces `snapshot_finalize`, when this method is not NULL, it will + * always run after a successful call to `snapshot`, whether the snapshot has + * been successfully written to disk or not. If it is set, raft will + * assume no ownership of any of the `raft_buffer`s and the responsibility to + * clean up lies with the user of raft. + * `snapshot_finalize` can be used to e.g. release a lock that was taken during + * a call to `snapshot`. Until `snapshot_finalize` is called, raft can access + * the data contained in the `raft_buffer`s. + * + * version 3: + * Adds support for async snapshots through the `snapshot_async` function. + * When this method is provided, raft will call `snapshot` in the main loop, + * and when successful, will call `snapshot_async` using the `io->async_work` + * method, so blocking I/O calls are allowed in the implementation. After the + * `snapshot_async` completes, `snapshot_finalize` will be called in the main + * loop, independent of the return value of `snapshot_async`. + * An implementation that does not use asynchronous snapshots MUST set + * `snapshot_async` to NULL. + * All memory allocated by the snapshot routines MUST be freed by the snapshot + * routines themselves. + */ + +struct raft_fsm +{ + int version; /* 1, 2 or 3 */ + void *data; + int (*apply)(struct raft_fsm *fsm, + const struct raft_buffer *buf, + void **result); + int (*snapshot)(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs); + int (*restore)(struct raft_fsm *fsm, struct raft_buffer *buf); + /* Fields below added since version 2. 
+         */
+        int (*snapshot_finalize)(struct raft_fsm *fsm,
+                                 struct raft_buffer *bufs[],
+                                 unsigned *n_bufs);
+        /* Fields below added since version 3. */
+        int (*snapshot_async)(struct raft_fsm *fsm,
+                              struct raft_buffer *bufs[],
+                              unsigned *n_bufs);
+};
+
+struct raft; /* Forward declaration. */
+
+/**
+ * State codes.
+ */
+enum { RAFT_UNAVAILABLE, RAFT_FOLLOWER, RAFT_CANDIDATE, RAFT_LEADER };
+
+/**
+ * State callback to invoke if raft's state changes.
+ */
+typedef void (*raft_state_cb)(struct raft *raft,
+                              unsigned short old_state,
+                              unsigned short new_state);
+
+struct raft_progress;
+
+/**
+ * Close callback.
+ *
+ * It's safe to release the memory of a raft instance only after this callback
+ * has fired.
+ */
+typedef void (*raft_close_cb)(struct raft *raft);
+
+struct raft_change;   /* Forward declaration */
+struct raft_transfer; /* Forward declaration */
+
+struct raft_log;
+
+/**
+ * Hold and drive the state of a single raft server in a cluster.
+ * When replacing reserved fields in the middle of this struct, you MUST use a
+ * type with the same size and alignment requirements as the original type.
+ */
+struct raft
+{
+        void *data;                 /* Custom user data. */
+        struct raft_tracer *tracer; /* Tracer implementation. */
+        struct raft_io *io;         /* Disk and network I/O implementation. */
+        struct raft_fsm *fsm;       /* User-defined FSM to apply commands to. */
+        raft_id id;                 /* Server ID of this raft instance. */
+        char *address;              /* Server address of this raft instance. */
+
+        /*
+         * Cache of the server's persistent state, updated on stable storage
+         * before responding to RPCs (Figure 3.1).
+         */
+        raft_term current_term; /* Latest term server has seen. */
+        raft_id voted_for; /* Candidate that received vote in current term. */
+        struct raft_log *log; /* Log entries. */
+
+        /*
+         * Current membership configuration (Chapter 4).
+         *
+         * At any given moment the current configuration can be committed or
+         * uncommitted.
+         *
+         * If a server is voting, the log entry with index 1 must always contain
+         * the first committed configuration.
+         *
+         * At all times #configuration_committed_index is either zero or is the
+         * index of the most recent log entry of type #RAFT_CHANGE that we know
+         * to be committed. That means #configuration_committed_index is always
+         * equal to or lower than #commit_index.
+         *
+         * At all times #configuration_uncommitted_index is either zero or is
+         * the index of an uncommitted log entry of type #RAFT_CHANGE. There can
+         * be at most one uncommitted entry of type #RAFT_CHANGE because we
+         * allow only one configuration change at a time.
+         *
+         * At all times #configuration_last_snapshot is a copy of the
+         * configuration contained in the most recent snapshot, if any.
+         *
+         * The possible scenarios are:
+         *
+         * 1. #configuration_committed_index and
+         *    #configuration_uncommitted_index are both zero. This should only
+         *    happen when a brand new server starts joining a cluster and is
+         *    waiting to receive log entries from the current leader. In this case
+         *    #configuration and #configuration_last_snapshot must be empty and
+         *    have no servers.
+         *
+         * 2. #configuration_committed_index is non-zero and
+         *    #configuration_uncommitted_index is zero. This means that
+         *    #configuration is committed and there is no pending configuration
+         *    change. The content of #configuration must match the one of the
+         *    log entry at #configuration_committed_index.
+         *
+         * 3. #configuration_committed_index and
+         *    #configuration_uncommitted_index are both non-zero, with the latter
+         *    being greater than the former. This means that #configuration is
+         *    uncommitted and represents a pending configuration change. The
+         *    content of #configuration must match the one of the log entry at
+         *    #configuration_uncommitted_index.
+         *
+         * When a snapshot is taken, a copy of the most recent configuration
+         * known to be committed (i.e. the configuration contained in the log
+         * entry at #configuration_committed_index) is saved in
+         * #configuration_last_snapshot, so it can be easily retrieved in case
+         * the log gets truncated because of compaction and does not contain the
+         * entry at #configuration_committed_index anymore. Likewise, if a
+         * snapshot is restored, its associated configuration is saved in
+         * #configuration_last_snapshot.
+         */
+        struct raft_configuration configuration;
+        struct raft_configuration configuration_last_snapshot;
+        raft_index configuration_committed_index;
+        raft_index configuration_uncommitted_index;
+
+        /*
+         * Election timeout in milliseconds (default 1000).
+         *
+         * From 3.4:
+         *
+         *   Raft uses a heartbeat mechanism to trigger leader election. When
+         *   servers start up, they begin as followers. A server remains in
+         *   follower state as long as it receives valid RPCs from a leader or
+         *   candidate. Leaders send periodic heartbeats (AppendEntries RPCs
+         *   that carry no log entries) to all followers in order to maintain
+         *   their authority. If a follower receives no communication over a
+         *   period of time called the election timeout, then it assumes there is
+         *   no viable leader and begins an election to choose a new leader.
+         *
+         * This is the baseline value and will be randomized between 1x and 2x.
+         *
+         * See raft_set_election_timeout() to customize the value of this
+         * attribute.
+         */
+        unsigned election_timeout;
+
+        /*
+         * Heartbeat timeout in milliseconds (default 100). This is relevant
+         * only when the raft instance is in leader state: empty
+         * AppendEntries RPCs will be sent if this number of milliseconds
+         * elapses without any user-triggered AppendEntries RPCs being sent.
+         *
+         * From Figure 3.1:
+         *
+         *   [Leaders] Send empty AppendEntries RPC during idle periods to
+         *   prevent election timeouts.
+         */
+        unsigned heartbeat_timeout;
+
+        /*
+         * When the leader sends an InstallSnapshot RPC to a follower it will
+         * consider the RPC as failed after this timeout and retry.
+         */
+        unsigned install_snapshot_timeout;
+
+        /*
+         * The fields below hold the part of the server's volatile state which
+         * is always applicable regardless of whether the server is
+         * follower, candidate or leader (Figure 3.1). This state is rebuilt
+         * automatically after a server restart.
+         */
+        raft_index commit_index; /* Highest log entry known to be committed */
+        raft_index last_applied; /* Highest log entry applied to the FSM */
+        raft_index last_stored;  /* Highest log entry persisted on disk */
+
+        /*
+         * Current server state of this raft instance, along with a union
+         * defining state-specific values.
+         */
+        unsigned short state;
+        union {
+                struct /* Follower */
+                {
+                        unsigned
+                            randomized_election_timeout; /* Timer expiration. */
+                        struct /* Current leader info. */
+                        {
+                                raft_id id;
+                                char *address;
+                        } current_leader;
+                        uint64_t append_in_flight_count;
+                        uint64_t reserved[7]; /* Future use */
+                } follower_state;
+                struct
+                {
+                        unsigned
+                            randomized_election_timeout; /* Timer expiration. */
+                        bool *votes;                     /* Vote results. */
+                        bool disrupt_leader;  /* For leadership transfer */
+                        bool in_pre_vote;     /* True in pre-vote phase. */
+                        uint64_t reserved[8]; /* Future use */
+                } candidate_state;
+                struct
+                {
+                        struct raft_progress
+                            *progress; /* Per-server replication state. */
+                        struct raft_change
+                            *change;         /* Pending membership change. */
+                        raft_id promotee_id; /* ID of server being promoted. */
+                        unsigned short round_number; /* Current sync round. */
+                        raft_index
+                            round_index;       /* Target of the current round. */
+                        raft_time round_start; /* Start of current round. */
+                        void *requests[2];     /* Outstanding client requests. */
+                        uint32_t
+                            voter_contacts; /* Current number of voting nodes we
+                                               are in contact with */
+                        uint32_t reserved2;   /* Future use */
+                        uint64_t reserved[7]; /* Future use */
+                } leader_state;
+        };
+
+        /* Election timer start.
+         *
+         * This timer has different purposes depending on the state. Followers
+         * convert to candidate after the randomized election timeout has
+         * elapsed without leader contact. Candidates start a new election after
+         * the randomized election timeout has elapsed without a winner. Leaders
+         * step down after the election timeout has elapsed without contacting a
+         * majority of voting servers. */
+        raft_time election_timer_start;
+
+        /* In-progress leadership transfer request, if any. */
+        struct raft_transfer *transfer;
+
+        /*
+         * Information about the last snapshot that was taken (if any).
+         */
+        struct
+        {
+                unsigned threshold;              /* N. of entries before snapshot */
+                unsigned trailing;               /* N. of trailing entries to retain */
+                struct raft_snapshot pending;    /* In progress snapshot */
+                struct raft_io_snapshot_put put; /* Store snapshot request */
+                uint64_t reserved[8];            /* Future use */
+        } snapshot;
+
+        /*
+         * Callback to invoke once a close request has completed.
+         */
+        raft_close_cb close_cb;
+
+        /*
+         * Human-readable message providing diagnostic information about the
+         * last error that occurred.
+         */
+        char errmsg[RAFT_ERRMSG_BUF_SIZE];
+
+        /* Whether to use pre-vote to avoid disconnected servers disrupting the
+         * current leader, as described in 4.2.3 and 9.6. */
+        bool pre_vote;
+
+        /* Limit how long to wait for a stand-by to catch up with the log when
+         * it's being promoted to voter. */
+        unsigned max_catch_up_rounds;
+        unsigned max_catch_up_round_duration;
+
+        /* uint64_t because we used a reserved field. In reality this is a
+         * pointer to a `struct raft_callbacks` that can be used to store e.g.
+         * various user-supplied callbacks. */
+        uint64_t callbacks;
+
+        /* Future extensions */
+        uint64_t reserved[31];
+};
+
+RAFT_API int raft_init(struct raft *r,
+                       struct raft_io *io,
+                       struct raft_fsm *fsm,
+                       raft_id id,
+                       const char *address);
+
+RAFT_API void raft_close(struct raft *r, raft_close_cb cb);
+
+/**
+ * This function MUST be called after raft_init and before raft_start.
+ * @cb will be called every time the raft state changes.
+ */
+RAFT_API void raft_register_state_cb(struct raft *r, raft_state_cb cb);
+
+/**
+ * Bootstrap this raft instance using the given configuration. The instance must
+ * not have been started yet and must be completely pristine, otherwise
+ * #RAFT_CANTBOOTSTRAP will be returned.
+ */
+RAFT_API int raft_bootstrap(struct raft *r,
+                            const struct raft_configuration *conf);
+
+/**
+ * Force a new configuration in order to recover from a loss of quorum where the
+ * current configuration cannot be restored, such as when a majority of servers
+ * die at the same time.
+ *
+ * This works by appending the new configuration directly to the log stored on
+ * disk.
+ *
+ * In order for this operation to be safe, you must follow these steps:
+ *
+ * 1. Make sure that no servers in the cluster are running, either because they
+ *    died or because you manually stopped them.
+ *
+ * 2. Run @raft_recover exactly one time, on the non-dead server which has
+ *    the highest term and the longest log.
+ *
+ * 3. Copy the data directory of the server you ran @raft_recover on to all
+ *    other non-dead servers in the cluster, replacing their current data
+ *    directory.
+ *
+ * 4. Restart all servers.
+ */
+RAFT_API int raft_recover(struct raft *r,
+                          const struct raft_configuration *conf);
+
+RAFT_API int raft_start(struct raft *r);
+
+/**
+ * Set the election timeout.
+ *
+ * Every raft instance is initialized with a default election timeout of 1000
+ * milliseconds. If you wish to tweak it, call this function before starting
+ * your event loop.
+ *
+ * From Chapter 9:
+ *
+ *   We recommend a range that is 10-20 times the one-way network latency, which
+ *   keeps split vote rates under 40% in all cases for reasonably sized
+ *   clusters, and typically results in much lower rates.
+ *
+ * Note that the current random election timer will be reset and a new one
+ * will be generated.
+ */
+RAFT_API void raft_set_election_timeout(struct raft *r, unsigned msecs);
+
+/**
+ * Set the heartbeat timeout.
+ */
+RAFT_API void raft_set_heartbeat_timeout(struct raft *r, unsigned msecs);
+
+/**
+ * Set the snapshot install timeout.
+ */
+RAFT_API void raft_set_install_snapshot_timeout(struct raft *r, unsigned msecs);
+
+/**
+ * Number of outstanding log entries before starting a new snapshot. The default
+ * is 1024.
+ */
+RAFT_API void raft_set_snapshot_threshold(struct raft *r, unsigned n);
+
+/**
+ * Enable or disable pre-vote support. Pre-vote is turned off by default.
+ */
+RAFT_API void raft_set_pre_vote(struct raft *r, bool enabled);
+
+/**
+ * Number of outstanding log entries to keep in the log after a snapshot has
+ * been taken. This avoids sending snapshots when a follower is behind by just a
+ * few entries. The default is 128.
+ */
+RAFT_API void raft_set_snapshot_trailing(struct raft *r, unsigned n);
+
+/**
+ * Set the maximum number of catch-up rounds to try when replicating entries
+ * to a stand-by server that is being promoted to voter, before giving up and
+ * failing the configuration change. The default is 10.
+ */
+RAFT_API void raft_set_max_catch_up_rounds(struct raft *r, unsigned n);
+
+/**
+ * Set the maximum duration of a catch-up round when replicating entries to a
+ * stand-by server that is being promoted to voter. The default is 5 seconds.
+ */
+RAFT_API void raft_set_max_catch_up_round_duration(struct raft *r,
+                                                   unsigned msecs);
+
+/**
+ * Return a human-readable description of the last error that occurred.
+ */
+RAFT_API const char *raft_errmsg(struct raft *r);
+
+/**
+ * Return the code of the current raft state (follower/candidate/leader).
+ */
+RAFT_API int raft_state(struct raft *r);
+
+/**
+ * Return the code of the current raft role (spare/standby/voter),
+ * or -1 if this server is not in the current configuration.
+ */
+RAFT_API int raft_role(struct raft *r);
+
+/**
+ * Return the ID and address of the current known leader, if any.
+ */
+RAFT_API void raft_leader(struct raft *r, raft_id *id, const char **address);
+
+/**
+ * Return the index of the last entry that was appended to the local log.
+ */ +RAFT_API raft_index raft_last_index(struct raft *r); + +/** + * Return the index of the last entry that was applied to the local FSM. + */ +RAFT_API raft_index raft_last_applied(struct raft *r); + +/** + * Return the number of voting servers that the leader has recently been in + * contact with. This can be used to help determine whether the cluster may be + * in a degraded/at risk state. + * + * Returns valid values >= 1, because a leader is always in contact with + * itself. + * Returns -1 if called on a follower. + * + * Note that the value returned may be out of date, and so should not be relied + * upon for absolute correctness. + */ +RAFT_API int raft_voter_contacts(struct raft *r); + +/** + * Common fields across client request types. + * `req_id`, `client_id` and `unique_id` are currently unused. + * `reserved` fields should be replaced by new members with the same size + * and alignment requirements as `uint64_t`. + */ +#define RAFT__REQUEST \ + void *data; \ + int type; \ + raft_index index; \ + void *queue[2]; \ + uint8_t req_id[16]; \ + uint8_t client_id[16]; \ + uint8_t unique_id[16]; \ + uint64_t reserved[4] + +/** + * Asynchronous request to append a new command entry to the log and apply it to + * the FSM when a quorum is reached. + */ +struct raft_apply; +typedef void (*raft_apply_cb)(struct raft_apply *req, int status, void *result); +struct raft_apply +{ + RAFT__REQUEST; + raft_apply_cb cb; +}; + +/** + * Propose to append commands to the log and apply them to the FSM once + * committed. + * + * If this server is the leader, it will create @n new log entries of type + * #RAFT_COMMAND using the given buffers as their payloads, append them to its + * own log and attempt to replicate them on other servers by sending + * AppendEntries RPCs. + * + * The memory pointed at by the @base attribute of each #raft_buffer in the + * given array must have been allocated with raft_malloc() or a compatible + * allocator. If this function returns 0, the ownership of this memory is + * implicitly transferred to the raft library, which will take care of releasing + * it when appropriate. Any further client access to such memory leads to + * undefined behavior. + * + * The ownership of the memory of the @bufs array itself is not transferred to + * the raft library, and, if allocated dynamically, must be deallocated by the + * caller. + * + * If the command was successfully applied, r->last_applied will be equal to + * the log entry index of the applied command when the cb is invoked. + */ +RAFT_API int raft_apply(struct raft *r, + struct raft_apply *req, + const struct raft_buffer bufs[], + const unsigned n, + raft_apply_cb cb); + +/** + * Asynchronous request to append a barrier entry. + */ +struct raft_barrier; +typedef void (*raft_barrier_cb)(struct raft_barrier *req, int status); +struct raft_barrier +{ + RAFT__REQUEST; + raft_barrier_cb cb; +}; + +/** + * Propose to append a log entry of type #RAFT_BARRIER. + * + * This can be used to ensure that there are no unapplied commands. + */ +RAFT_API int raft_barrier(struct raft *r, + struct raft_barrier *req, + raft_barrier_cb cb); + +/** + * Asynchronous request to change the raft configuration. + */ +typedef void (*raft_change_cb)(struct raft_change *req, int status); +struct raft_change +{ + RAFT__REQUEST; + raft_change_cb cb; +}; + +/** + * Add a new server to the cluster configuration. Its initial role will be + * #RAFT_SPARE. 
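+ *
+ * Illustrative usage (editor's sketch; the callback, server ID and address
+ * are hypothetical, r is the local struct raft instance, and error handling
+ * is omitted):
+ *
+ *     static void add_cb(struct raft_change *req, int status)
+ *     {
+ *         if (status != 0) {
+ *             ... the new server could not be added ...
+ *         }
+ *     }
+ *
+ *     struct raft_change req;
+ *     raft_add(&r, &req, 4, "127.0.0.1:9004", add_cb);
+ *
+ * Once added, the server can later be given a different role with
+ * raft_assign().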
+ */
+RAFT_API int raft_add(struct raft *r,
+                      struct raft_change *req,
+                      raft_id id,
+                      const char *address,
+                      raft_change_cb cb);
+
+/**
+ * Assign a new role to the given server.
+ *
+ * If the server already has the given role, or if the given role is unknown,
+ * #RAFT_BADROLE is returned.
+ */
+RAFT_API int raft_assign(struct raft *r,
+                         struct raft_change *req,
+                         raft_id id,
+                         int role,
+                         raft_change_cb cb);
+
+/**
+ * Remove the given server from the cluster configuration.
+ */
+RAFT_API int raft_remove(struct raft *r,
+                         struct raft_change *req,
+                         raft_id id,
+                         raft_change_cb cb);
+
+/**
+ * Asynchronous request to transfer leadership.
+ */
+typedef void (*raft_transfer_cb)(struct raft_transfer *req);
+struct raft_transfer
+{
+        RAFT__REQUEST;
+        raft_id id;               /* ID of target server. */
+        raft_time start;          /* Start of leadership transfer. */
+        struct raft_io_send send; /* For sending TimeoutNow */
+        raft_transfer_cb cb;      /* User callback */
+};
+
+/**
+ * Transfer leadership to the server with the given ID.
+ *
+ * If the target server is not part of the configuration, or it's the leader
+ * itself, or it's not a #RAFT_VOTER, then #RAFT_BADID is returned.
+ *
+ * The special value #0 means to automatically select a voting follower to
+ * transfer leadership to. If there are no voting followers, return
+ * #RAFT_NOTFOUND.
+ *
+ * When this server detects that the target server has become the leader, or
+ * when @election_timeout milliseconds have elapsed, the given callback will be
+ * invoked.
+ *
+ * After the callback fires, clients can check whether the operation was
+ * successful or not by calling @raft_leader() and checking if it returns the
+ * target server.
+ */
+RAFT_API int raft_transfer(struct raft *r,
+                           struct raft_transfer *req,
+                           raft_id id,
+                           raft_transfer_cb cb);
+
+/**
+ * User-definable dynamic memory allocation functions.
+ *
+ * The @data field will be passed as first argument to all functions.
+ */
+struct raft_heap
+{
+        void *data; /* User data */
+        void *(*malloc)(void *data, size_t size);
+        void (*free)(void *data, void *ptr);
+        void *(*calloc)(void *data, size_t nmemb, size_t size);
+        void *(*realloc)(void *data, void *ptr, size_t size);
+        void *(*aligned_alloc)(void *data, size_t alignment, size_t size);
+        void (*aligned_free)(void *data, size_t alignment, void *ptr);
+};
+
+DQLITE_VISIBLE_TO_TESTS void *raft_malloc(size_t size);
+DQLITE_VISIBLE_TO_TESTS void raft_free(void *ptr);
+DQLITE_VISIBLE_TO_TESTS void *raft_calloc(size_t nmemb, size_t size);
+DQLITE_VISIBLE_TO_TESTS void *raft_realloc(void *ptr, size_t size);
+DQLITE_VISIBLE_TO_TESTS void *raft_aligned_alloc(size_t alignment, size_t size);
+DQLITE_VISIBLE_TO_TESTS void raft_aligned_free(size_t alignment, void *ptr);
+
+/**
+ * Use a custom dynamic memory allocator.
+ */
+DQLITE_VISIBLE_TO_TESTS void raft_heap_set(struct raft_heap *heap);
+
+/**
+ * Use the default dynamic memory allocator (from the stdlib). This clears any
+ * custom allocator specified with @raft_heap_set.
+ */
+DQLITE_VISIBLE_TO_TESTS void raft_heap_set_default(void);
+
+/**
+ * Return a reference to the current dynamic memory allocator.
+ *
+ * This is intended for use by applications that want to temporarily replace
+ * and then restore the original allocator, or that want to defer to the
+ * original allocator in some circumstances.
+ *
+ * The behavior of attempting to mutate the default allocator through the
+ * pointer returned by this function, including attempting to deallocate
+ * the backing memory, is undefined.
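+ *
+ * For example (editor's sketch; counting_heap is a hypothetical
+ * user-defined allocator), a test can temporarily swap allocators and then
+ * restore the original one:
+ *
+ *     const struct raft_heap *orig = raft_heap_get();
+ *     raft_heap_set(&counting_heap);
+ *     ... exercise the code under test ...
+ *     raft_heap_set((struct raft_heap *)orig);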
+ */
+DQLITE_VISIBLE_TO_TESTS const struct raft_heap *raft_heap_get(void);
+
+#undef RAFT__REQUEST
+
+struct raft_uv_transport;
+
+/**
+ * Configure the given @raft_io instance to use a libuv-based I/O
+ * implementation.
+ *
+ * The @dir path will be copied, and its memory can possibly be released once
+ * this function returns.
+ *
+ * Return #RAFT_NAMETOOLONG if @dir exceeds the size of the internal buffer
+ * that should hold it.
+ *
+ * Return #RAFT_NOTFOUND if @dir does not exist.
+ *
+ * Return #RAFT_INVALID if @dir exists but it's not a directory.
+ *
+ * The implementation of metadata and log persistence is virtually the same as
+ * the one found in LogCabin [0].
+ *
+ * The disk files consist of metadata files, closed segments, and open
+ * segments. Metadata files are used to track Raft metadata, such as the
+ * server's current term, vote, and log's start index. Segments contain
+ * contiguous entries that are part of the log. Closed segments are never
+ * written to again (but may be renamed and truncated if a suffix of the log is
+ * truncated). Open segments are where newly appended entries go. Once an open
+ * segment reaches the maximum allowed size, it is closed and a new one is used.
+ *
+ * Metadata files are named "metadata1" and "metadata2". The code alternates
+ * between these so that there is always at least one readable metadata file.
+ * On boot, the readable metadata file with the higher version number is used.
+ *
+ * The format of a metadata file is:
+ *
+ * [8 bytes] Format (currently 1).
+ * [8 bytes] Incremental version number.
+ * [8 bytes] Current term.
+ * [8 bytes] ID of server we voted for.
+ *
+ * Closed segments are named by the format string "%lu-%lu" with their
+ * start and end indexes, both inclusive. Closed segments always contain at
+ * least one entry; the end index is always at least as large as the start
+ * index. Closed segment files may occasionally include data past their
+ * filename's end index (these are ignored but a warning is logged). This can
+ * happen if the suffix of the segment is truncated and a crash occurs at an
+ * inopportune time (the segment file is first renamed, then truncated, and a
+ * crash occurs in between).
+ *
+ * Open segments are named by the format string "open-%lu" with a unique
+ * number. These should not exist when the server shuts down cleanly, but they
+ * exist while the server is running and may be left around during a crash.
+ * Open segments either contain entries which come after the last closed
+ * segment or are full of zeros. When the server crashes while appending to an
+ * open segment, the end of that file may be corrupt. We can't distinguish
+ * between a corrupt file and a partially written entry. The code assumes it's
+ * a partially written entry, logs a warning, and ignores it.
+ *
+ * Truncating a suffix of the log will remove all entries that are no longer
+ * part of the log. Truncating a prefix of the log will only remove complete
+ * segments that are before the new log start index. For example, if a
+ * segment has entries 10 through 20 and the prefix of the log is truncated to
+ * start at entry 15, that entire segment will be retained.
+ *
+ * Each segment file starts with a segment header, which currently contains
+ * just an 8-byte version number for the format of that segment. The current
+ * format (version 1) is just a concatenation of serialized entry batches.
+ *
+ * Each batch has the following format:
+ *
+ * [4 bytes] CRC32 checksum of the batch header, little endian.
+ * [4 bytes] CRC32 checksum of the batch data, little endian.
+ * [ ... ] Batch (as described in @raft_decode_entries_batch).
+ *
+ * [0] https://github.com/logcabin/logcabin/blob/master/Storage/SegmentedLog.h
+ */
+RAFT_API int raft_uv_init(struct raft_io *io,
+                          struct uv_loop_s *loop,
+                          const char *dir,
+                          struct raft_uv_transport *transport);
+
+/**
+ * Release any memory allocated internally.
+ */
+RAFT_API void raft_uv_close(struct raft_io *io);
+
+/**
+ * Set the block size that will be used for direct I/O.
+ *
+ * The default is to automatically detect the appropriate block size.
+ */
+RAFT_API void raft_uv_set_block_size(struct raft_io *io, size_t size);
+
+/**
+ * Set the maximum initial size of newly created open segments.
+ *
+ * If the given size is not a multiple of the block size, the actual size will
+ * be reduced to the closest multiple.
+ *
+ * The default is 8 megabytes.
+ */
+RAFT_API void raft_uv_set_segment_size(struct raft_io *io, size_t size);
+
+/**
+ * Turn snapshot compression on or off.
+ * Returns non-0 on failure; this can happen, for example, when compression is
+ * requested but no suitable compression library is found.
+ *
+ * By default snapshots are compressed if the appropriate libraries are found.
+ */
+RAFT_API int raft_uv_set_snapshot_compression(struct raft_io *io,
+                                              bool compressed);
+
+/**
+ * Set how many milliseconds to wait between subsequent retries when
+ * establishing a connection with another server. The default is 1000
+ * milliseconds.
+ */
+RAFT_API void raft_uv_set_connect_retry_delay(struct raft_io *io,
+                                              unsigned msecs);
+
+/**
+ * Emit low-level debug messages using the given tracer.
+ */
+RAFT_API void raft_uv_set_tracer(struct raft_io *io,
+                                 struct raft_tracer *tracer);
+
+/**
+ * Enable or disable auto-recovery on startup. Default enabled.
+ */
+RAFT_API void raft_uv_set_auto_recovery(struct raft_io *io, bool flag);
+
+/**
+ * Callback invoked by the transport implementation when a new incoming
+ * connection has been established.
+ *
+ * No references to @address must be kept after this function returns.
+ *
+ * Ownership of @stream is transferred to user code, which is responsible for
+ * uv_close()'ing it and then releasing its memory.
+ */
+typedef void (*raft_uv_accept_cb)(struct raft_uv_transport *t,
+                                  raft_id id,
+                                  const char *address,
+                                  struct uv_stream_s *stream);
+
+/**
+ * Callback invoked by the transport implementation after a connect request has
+ * completed. If status is #0, then @stream will point to a valid handle, which
+ * user code is then responsible for uv_close()'ing and then releasing.
+ */
+struct raft_uv_connect;
+typedef void (*raft_uv_connect_cb)(struct raft_uv_connect *req,
+                                   struct uv_stream_s *stream,
+                                   int status);
+
+/**
+ * Handle to a connect request.
+ */
+struct raft_uv_connect
+{
+        void *data;            /* User data */
+        raft_uv_connect_cb cb; /* Callback */
+};
+
+/**
+ * Callback invoked by the transport implementation after a close request is
+ * completed.
+ */
+typedef void (*raft_uv_transport_close_cb)(struct raft_uv_transport *t);
+
+/**
+ * Interface to establish outgoing connections to other Raft servers and to
+ * accept incoming connections from them.
+ */
+
+struct raft_uv_transport
+{
+        /**
+         * Keep track of the struct version. MUST be filled out by the user.
+         * When moving to a new version, the user MUST implement the newly
+         * added methods.
+         * Latest version is 1.
+         */
+        int version;
+
+        /**
+         * User defined data.
+         */
+        void *data;
+
+        /**
+         * Implementation-defined state.
+         */
+        void *impl;
+
+        /**
+         * Human-readable message providing diagnostic information about the
+         * last error that occurred.
+         */
+        char errmsg[RAFT_ERRMSG_BUF_SIZE];
+
+        /**
+         * Initialize the transport with the given server's identity.
+         */
+        int (*init)(struct raft_uv_transport *t,
+                    raft_id id,
+                    const char *address);
+
+        /**
+         * Start listening for incoming connections.
+         *
+         * Once a new connection is accepted, the @cb callback passed in the
+         * initializer must be invoked with the relevant details of the
+         * connecting Raft server.
+         */
+        int (*listen)(struct raft_uv_transport *t, raft_uv_accept_cb cb);
+
+        /**
+         * Connect to the server with the given ID and address.
+         *
+         * The @cb callback must be invoked when the connection has been
+         * established or the connection attempt has failed. The memory pointed
+         * by @req can be released only after @cb has fired.
+         */
+        int (*connect)(struct raft_uv_transport *t,
+                       struct raft_uv_connect *req,
+                       raft_id id,
+                       const char *address,
+                       raft_uv_connect_cb cb);
+
+        /**
+         * Close the transport.
+         *
+         * The implementation must:
+         *
+         * - Stop accepting incoming connections. The @cb callback passed to
+         *   @listen must not be invoked anymore.
+         *
+         * - Cancel all pending @connect requests.
+         *
+         * - Invoke the @cb callback passed to this method once it's safe to
+         *   release the memory of the transport object.
+         */
+        void (*close)(struct raft_uv_transport *t,
+                      raft_uv_transport_close_cb cb);
+};
+
+/**
+ * Init a transport interface that uses TCP sockets.
+ */
+RAFT_API int raft_uv_tcp_init(struct raft_uv_transport *t,
+                              struct uv_loop_s *loop);
+
+/**
+ * Release any memory allocated internally.
+ */
+RAFT_API void raft_uv_tcp_close(struct raft_uv_transport *t);
+
+/**
+ * Set the IP address and port that the listening socket will bind to.
+ *
+ * By default the socket will bind to the address provided in
+ * raft_init(), which may be inconvenient if running your application in a
+ * container, for example.
+ *
+ * The @address argument must be an IPv4 dotted quad IP address and port, e.g.
+ * "0.0.0.0:8080". If you do not provide a port, the default of 8080 will be
+ * used. The port given here *must* match the port given to raft_init().
+ *
+ * Must be called before raft_init().
+ */
+RAFT_API int raft_uv_tcp_set_bind_address(struct raft_uv_transport *t,
+                                          const char *address);
+
+/**
+ * Raft cluster test fixture, using an in-memory @raft_io implementation. This
+ * is meant to be used in unit tests.
+ */
+
+#define RAFT_FIXTURE_MAX_SERVERS 8
+
+/**
+ * Fixture step event types.
+ */
+enum {
+        RAFT_FIXTURE_TICK = 1, /* The tick callback has been invoked */
+        RAFT_FIXTURE_NETWORK,  /* A network request has been sent or received */
+        RAFT_FIXTURE_DISK,     /* An I/O request has been submitted */
+        RAFT_FIXTURE_WORK      /* A large, CPU and/or memory intensive task */
+};
+
+/**
+ * State of a single server in a cluster fixture.
+ */
+struct raft_fixture_server;
+
+/**
+ * Information about a test cluster event triggered by the fixture.
+ */
+struct raft_fixture_event;
+
+/**
+ * Returns the type of the event.
+ */
+int raft_fixture_event_type(struct raft_fixture_event *event);
+
+/**
+ * Returns the server index of the event.
+ */
+unsigned raft_fixture_event_server_index(struct raft_fixture_event *event);
+
+/**
+ * Event callback. See raft_fixture_hook().
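+ *
+ * For example (editor's sketch), a test can define a hook that logs every
+ * fixture event:
+ *
+ *     static void trace_hook(struct raft_fixture *f,
+ *                            struct raft_fixture_event *event)
+ *     {
+ *         (void)f;
+ *         printf("event of type %d on server %u\n",
+ *                raft_fixture_event_type(event),
+ *                raft_fixture_event_server_index(event));
+ *     }
+ *
+ * and register it with raft_fixture_hook(f, trace_hook), declared below.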
+ */
+struct raft_fixture;
+typedef void (*raft_fixture_event_cb)(struct raft_fixture *f,
+                                      struct raft_fixture_event *event);
+
+/**
+ * Test implementation of a cluster of @n servers, each having a user-provided
+ * FSM.
+ *
+ * The cluster can simulate network latency and time elapsed on individual
+ * servers.
+ *
+ * Servers can be alive or dead. Network messages sent to dead servers are
+ * dropped. Dead servers do not have their @raft_io_tick_cb callback invoked.
+ *
+ * Any two servers can be connected or disconnected. Network messages sent
+ * between disconnected servers are dropped.
+ */
+struct raft_fixture
+{
+        raft_time time;          /* Global time, common to all servers. */
+        unsigned n;              /* Number of servers. */
+        raft_id leader_id;       /* ID of current leader, or 0 if none. */
+        struct raft_log *log;    /* Copy of current leader's log. */
+        raft_index commit_index; /* Current commit index on leader. */
+        struct raft_fixture_event *event; /* Last event that occurred. */
+        raft_fixture_event_cb hook;       /* Event callback. */
+        struct raft_fixture_server *servers[RAFT_FIXTURE_MAX_SERVERS];
+        uint64_t reserved[16]; /* For future expansion of struct. */
+};
+
+/**
+ * Initialize a raft cluster fixture. Servers can be added by using
+ * `raft_fixture_grow`.
+ */
+RAFT_API int raft_fixture_init(struct raft_fixture *f);
+
+/**
+ * Release all memory used by the fixture.
+ */
+RAFT_API void raft_fixture_close(struct raft_fixture *f);
+
+/**
+ * Convenience to generate a configuration object containing all servers in the
+ * cluster. The first @n_voting servers will be voting ones.
+ */
+RAFT_API int raft_fixture_configuration(struct raft_fixture *f,
+                                        unsigned n_voting,
+                                        struct raft_configuration *conf);
+
+/**
+ * Convenience to bootstrap all servers in the cluster using the given
+ * configuration.
+ */
+RAFT_API int raft_fixture_bootstrap(struct raft_fixture *f,
+                                    struct raft_configuration *conf);
+
+/**
+ * Convenience to start all servers in the fixture.
+ */
+RAFT_API int raft_fixture_start(struct raft_fixture *f);
+
+/**
+ * Return the number of servers in the fixture.
+ */
+RAFT_API unsigned raft_fixture_n(struct raft_fixture *f);
+
+/**
+ * Return the current cluster global time. All raft instances see the same
+ * time.
+ */
+RAFT_API raft_time raft_fixture_time(struct raft_fixture *f);
+
+/**
+ * Return the raft instance associated with the @i'th server of the fixture.
+ */
+RAFT_API struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i);
+
+/**
+ * Return @true if the @i'th server hasn't been killed.
+ */
+RAFT_API bool raft_fixture_alive(struct raft_fixture *f, unsigned i);
+
+/**
+ * Return the index of the current leader, or the current number of servers if
+ * there's no leader.
+ */
+RAFT_API unsigned raft_fixture_leader_index(struct raft_fixture *f);
+
+/**
+ * Return the ID of the server the @i'th server has voted for, or zero if it
+ * hasn't voted.
+ */
+RAFT_API raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i);
+
+/**
+ * Drive the cluster so the @i'th server starts an election but doesn't
+ * necessarily win it.
+ *
+ * This is achieved by bumping the randomized election timeout of all other
+ * servers to a very high value, letting the @i'th server's own timeout expire.
+ *
+ * There must currently be no leader and no candidate and the given server must
+ * be a voting one. Also, the @i'th server must be connected to a majority of
+ * voting servers.
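+ *
+ * A typical call sequence, as an editor's sketch (error handling omitted;
+ * the fsms array is assumed to be provided by the test):
+ *
+ *     struct raft_fixture f;
+ *     struct raft_configuration conf;
+ *     raft_fixture_init(&f);
+ *     raft_fixture_grow(&f, &fsms[0]);
+ *     raft_fixture_grow(&f, &fsms[1]);
+ *     raft_fixture_grow(&f, &fsms[2]);
+ *     raft_fixture_configuration(&f, 3, &conf);
+ *     raft_fixture_bootstrap(&f, &conf);
+ *     raft_fixture_start(&f);
+ *     raft_fixture_start_elect(&f, 0);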
+ */
+RAFT_API void raft_fixture_start_elect(struct raft_fixture *f, unsigned i);
+
+/**
+ * Calls raft_fixture_start_elect, but waits and asserts that the @i'th server
+ * has become the leader.
+ */
+RAFT_API void raft_fixture_elect(struct raft_fixture *f, unsigned i);
+
+/**
+ * Drive the cluster so the current leader gets deposed.
+ *
+ * This is achieved by dropping all AppendEntries result messages sent by
+ * followers to the leader, until the leader decides to step down because it has
+ * lost connectivity to a majority of followers.
+ */
+RAFT_API void raft_fixture_depose(struct raft_fixture *f);
+
+/**
+ * Step through the cluster state advancing the time to the minimum value needed
+ * for it to make progress (i.e. for a message to be delivered, for an I/O
+ * operation to complete or for a single time tick to occur).
+ *
+ * In particular, the following happens:
+ *
+ * 1. If there are pending #raft_io_send requests, that have been submitted
+ *    using #raft_io->send() and not yet sent, the oldest one is picked and the
+ *    relevant callback fired. This simulates completion of a socket write,
+ *    which means that the send request has been completed. The receiver does
+ *    not immediately receive the message, as the message is propagating
+ *    through the network. However, any memory associated with the
+ *    #raft_io_send request can be released (e.g. log entries). The in-memory
+ *    I/O implementation assigns a latency to each RPC message, which will get
+ *    delivered to the receiver only after that amount of time elapses. If the
+ *    sender and the receiver are currently disconnected, the RPC message is
+ *    simply dropped. If a callback was fired, jump directly to 3. and skip 2.
+ *
+ * 2. All pending #raft_io_append disk writes across all servers, that have been
+ *    submitted using #raft_io->append() but not yet completed, are scanned and
+ *    the one with the lowest completion time is picked. All in-flight network
+ *    messages waiting to be delivered are scanned and the one with the lowest
+ *    delivery time is picked. All servers are scanned, and the one with the
+ *    lowest tick expiration time is picked. The three times are compared and
+ *    the lowest one is picked. If a #raft_io_append disk write has completed,
+ *    the relevant callback will be invoked, if there's a network message to be
+ *    delivered, the receiver's @raft_io_recv_cb callback gets fired, if a tick
+ *    timer has expired the relevant #raft_io->tick() callback will be
+ *    invoked. Only one event will be fired. If there is more than one event to
+ *    fire, one of them is picked according to the following rules: events for
+ *    servers with lower index are fired first, tick events take precedence over
+ *    disk events, and disk events take precedence over network events.
+ *
+ * 3. The current cluster leader is detected (if any). When detecting the
+ *    leader, the Election Safety property is checked: no two servers may be in
+ *    leader state in the same term. The server in leader state with the
+ *    highest term is considered the current cluster leader, as long as it's
+ *    "stable", i.e. it has been acknowledged by all servers connected to it,
+ *    and those servers form a majority (this means that no further leader
+ *    change can happen, unless the network gets disrupted). If there is a
+ *    stable leader and it has not changed with respect to the previous call to
+ *    @raft_fixture_step(), then the Leader Append-Only property is checked, by
+ *    comparing its log with a copy of it that was taken during the previous
+ *    iteration.
+ *
+ * 4. If there is a stable leader, its current log is copied, in order to be
+ *    able to check the Leader Append-Only property at the next call.
+ *
+ * 5. If there is a stable leader, its commit index gets copied.
+ *
+ * The function returns information about which particular event occurred
+ * (either in step 1 or 2).
+ */
+RAFT_API struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f);
+
+/**
+ * Call raft_fixture_step() exactly @n times, and return the last event fired.
+ */
+RAFT_API struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f,
+                                                        unsigned n);
+
+/**
+ * Step the cluster until the given @stop function returns #true, or @max_msecs
+ * have elapsed.
+ *
+ * Return #true if the @stop function has returned #true within @max_msecs.
+ */
+RAFT_API bool raft_fixture_step_until(struct raft_fixture *f,
+                                      bool (*stop)(struct raft_fixture *f,
+                                                   void *arg),
+                                      void *arg,
+                                      unsigned max_msecs);
+
+/**
+ * Step the cluster until @msecs have elapsed.
+ */
+RAFT_API void raft_fixture_step_until_elapsed(struct raft_fixture *f,
+                                              unsigned msecs);
+
+/**
+ * Step the cluster until a leader is elected, or @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_has_leader(struct raft_fixture *f,
+                                                 unsigned max_msecs);
+
+/**
+ * Step the cluster until the current leader gets deposed, or @max_msecs have
+ * elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f,
+                                                    unsigned max_msecs);
+
+/**
+ * Step the cluster until the @i'th server has applied the entry at the given
+ * index, or @max_msecs have elapsed. If @i equals the number of servers, then
+ * step until all servers have applied the given entry.
+ */
+RAFT_API bool raft_fixture_step_until_applied(struct raft_fixture *f,
+                                              unsigned i,
+                                              raft_index index,
+                                              unsigned max_msecs);
+
+/**
+ * Step the cluster until the state of the @i'th server matches the given one,
+ * or @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_state_is(struct raft_fixture *f,
+                                               unsigned i,
+                                               int state,
+                                               unsigned max_msecs);
+
+/**
+ * Step the cluster until the term of the @i'th server matches the given one,
+ * or @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_term_is(struct raft_fixture *f,
+                                              unsigned i,
+                                              raft_term term,
+                                              unsigned max_msecs);
+
+/**
+ * Step the cluster until the @i'th server has voted for the @j'th one, or
+ * @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_voted_for(struct raft_fixture *f,
+                                                unsigned i,
+                                                unsigned j,
+                                                unsigned max_msecs);
+
+/**
+ * Step the cluster until all pending network messages from the @i'th server to
+ * the @j'th server have been delivered, or @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_delivered(struct raft_fixture *f,
+                                                unsigned i,
+                                                unsigned j,
+                                                unsigned max_msecs);
+
+/**
+ * Set a function to be called after every time a fixture event occurs as a
+ * consequence of a step.
+ */
+RAFT_API void raft_fixture_hook(struct raft_fixture *f,
+                                raft_fixture_event_cb hook);
+
+/**
+ * Disconnect the @i'th and the @j'th servers, so attempts to send a message
+ * from @i to @j will fail with #RAFT_NOCONNECTION.
+ */
+RAFT_API void raft_fixture_disconnect(struct raft_fixture *f,
+                                      unsigned i,
+                                      unsigned j);
+
+/**
+ * Reconnect the @i'th and the @j'th servers, so attempts to send a message
+ * from @i to @j will succeed again.
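+ *
+ * For example (editor's sketch), a three-server test can partition the
+ * current leader, assumed here to be server 0, and later heal the cluster:
+ *
+ *     raft_fixture_disconnect(f, 0, 1);
+ *     raft_fixture_disconnect(f, 0, 2);
+ *     raft_fixture_step_until_has_no_leader(f, 10000);
+ *     raft_fixture_reconnect(f, 0, 1);
+ *     raft_fixture_reconnect(f, 0, 2);
+ *     raft_fixture_step_until_has_leader(f, 10000);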
+ */
+RAFT_API void raft_fixture_reconnect(struct raft_fixture *f,
+                                     unsigned i,
+                                     unsigned j);
+
+/**
+ * Saturate the connection between the @i'th and the @j'th servers, so messages
+ * sent by @i to @j will be silently dropped.
+ */
+RAFT_API void raft_fixture_saturate(struct raft_fixture *f,
+                                    unsigned i,
+                                    unsigned j);
+
+/**
+ * Return true if the connection from the @i'th to the @j'th server has been set
+ * as saturated.
+ */
+RAFT_API bool raft_fixture_saturated(struct raft_fixture *f,
+                                     unsigned i,
+                                     unsigned j);
+
+/**
+ * Desaturate the connection between the @i'th and the @j'th servers, so
+ * messages sent by @i to @j will start being delivered again.
+ */
+RAFT_API void raft_fixture_desaturate(struct raft_fixture *f,
+                                      unsigned i,
+                                      unsigned j);
+
+/**
+ * Kill the server with the given index. The server won't receive any message
+ * and its tick callback won't be invoked.
+ */
+RAFT_API void raft_fixture_kill(struct raft_fixture *f, unsigned i);
+
+/**
+ * Revive a killed server with the given index.
+ */
+RAFT_API void raft_fixture_revive(struct raft_fixture *f, unsigned i);
+
+/**
+ * Add a new empty server to the cluster and connect it to all others.
+ */
+RAFT_API int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm);
+
+/**
+ * Set the value that will be returned to the @i'th raft instance when it asks
+ * the underlying #raft_io implementation for a randomized election timeout
+ * value. The default value is 1000 + @i * 100, meaning that the election timer
+ * of server 0 will expire first.
+ */
+RAFT_API void raft_fixture_set_randomized_election_timeout(
+    struct raft_fixture *f,
+    unsigned i,
+    unsigned msecs);
+
+/**
+ * Set the network latency in milliseconds. Each RPC message sent by the @i'th
+ * server from now on will take @msecs milliseconds to be delivered. The default
+ * value is 15.
+ */
+RAFT_API void raft_fixture_set_network_latency(struct raft_fixture *f,
+                                               unsigned i,
+                                               unsigned msecs);
+
+/**
+ * Set the disk I/O latency in milliseconds. Each append request will take this
+ * amount of milliseconds to complete. The default value is 10.
+ */
+RAFT_API void raft_fixture_set_disk_latency(struct raft_fixture *f,
+                                            unsigned i,
+                                            unsigned msecs);
+
+/**
+ * Set the send latency in milliseconds. Each message send will take this many
+ * milliseconds before the send callback is invoked.
+ */
+RAFT_API void raft_fixture_set_send_latency(struct raft_fixture *f,
+                                            unsigned i,
+                                            unsigned j,
+                                            unsigned msecs);
+
+/**
+ * Set the persisted term of the @i'th server.
+ */
+RAFT_API void raft_fixture_set_term(struct raft_fixture *f,
+                                    unsigned i,
+                                    raft_term term);
+
+/**
+ * Set the most recent persisted snapshot on the @i'th server.
+ */
+RAFT_API void raft_fixture_set_snapshot(struct raft_fixture *f,
+                                        unsigned i,
+                                        struct raft_snapshot *snapshot);
+
+/**
+ * Add an entry to the persisted entries of the @i'th server.
+ */
+RAFT_API void raft_fixture_add_entry(struct raft_fixture *f,
+                                     unsigned i,
+                                     struct raft_entry *entry);
+
+/**
+ * Fault injection: make the corresponding I/O operation (entry append, vote
+ * persistence, term persistence or message send) on the @i'th server fail,
+ * after @delay further calls.
+ */
+RAFT_API void raft_fixture_append_fault(struct raft_fixture *f,
+                                        unsigned i,
+                                        int delay);
+
+RAFT_API void raft_fixture_vote_fault(struct raft_fixture *f,
+                                      unsigned i,
+                                      int delay);
+
+RAFT_API void raft_fixture_term_fault(struct raft_fixture *f,
+                                      unsigned i,
+                                      int delay);
+
+RAFT_API void raft_fixture_send_fault(struct raft_fixture *f,
+                                      unsigned i,
+                                      int delay);
+
+/**
+ * Return the number of messages of the given type that the @i'th server has
+ * successfully sent so far.
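+ *
+ * For example (editor's sketch, assuming the RAFT_IO_APPEND_ENTRIES message
+ * type constant defined earlier in this header, and that server 0 is the
+ * current leader), a test can check that the leader has sent at least one
+ * heartbeat or replication message:
+ *
+ *     unsigned n = raft_fixture_n_send(f, 0, RAFT_IO_APPEND_ENTRIES);
+ *     assert(n > 0);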
+ */
+RAFT_API unsigned raft_fixture_n_send(struct raft_fixture *f,
+                                      unsigned i,
+                                      int type);
+
+/**
+ * Return the number of messages of the given type that the @i'th server has
+ * received so far.
+ */
+RAFT_API unsigned raft_fixture_n_recv(struct raft_fixture *f,
+                                      unsigned i,
+                                      int type);
+
+/**
+ * Force the @i'th server into the UNAVAILABLE state.
+ */
+RAFT_API void raft_fixture_make_unavailable(struct raft_fixture *f, unsigned i);
+
+#endif /* RAFT_H */
diff --git a/src/raft/array.h b/src/raft/array.h
new file mode 100644
index 000000000..711135cc6
--- /dev/null
+++ b/src/raft/array.h
@@ -0,0 +1,25 @@
+/* Macros to manipulate contiguous arrays. */
+
+#ifndef ARRAY_H_
+#define ARRAY_H_
+
+#include "../raft.h"
+
+/* Append item I of type T to array A which currently has N items.
+ *
+ * A and N must both be pointers. Set RV to -1 in case of failure. */
+#define ARRAY__APPEND(T, I, A, N, RV)                                 \
+        {                                                             \
+                T *tmp_array;                                         \
+                tmp_array = raft_realloc(*A, (*N + 1) * sizeof **A);  \
+                if (tmp_array != NULL) {                              \
+                        (*N)++;                                       \
+                        *A = tmp_array;                               \
+                        (*A)[(*N) - 1] = I;                           \
+                        RV = 0;                                       \
+                } else {                                              \
+                        RV = -1;                                      \
+                }                                                     \
+        }
+
+#endif /* ARRAY_H_ */
diff --git a/src/raft/assert.h b/src/raft/assert.h
new file mode 100644
index 000000000..3bb77d1ce
--- /dev/null
+++ b/src/raft/assert.h
@@ -0,0 +1,41 @@
+/* Define the assert() macro, either as the standard one or the test one. */
+
+#ifndef ASSERT_H_
+#define ASSERT_H_
+
+#if defined(RAFT_TEST)
+extern void munit_errorf_ex(const char *filename,
+                            int line,
+                            const char *format,
+                            ...);
+#define assert(expr)                                                    \
+        do {                                                            \
+                if (!(expr)) {                                          \
+                        munit_errorf_ex(__FILE__, __LINE__,             \
+                                        "assertion failed: %s", #expr); \
+                }                                                       \
+        } while (0)
+#elif defined(NDEBUG)
+#define assert(x)                        \
+        do {                             \
+                (void)sizeof(x);         \
+        } while (0)
+#elif defined(RAFT_ASSERT_WITH_BACKTRACE)
+#include <assert.h> /* for __assert_fail */
+#include <backtrace.h>
+#include <stdio.h>
+#undef assert
+#define assert(x)                                                             \
+        do {                                                                  \
+                struct backtrace_state *state_;                               \
+                if (!(x)) {                                                   \
+                        state_ = backtrace_create_state(NULL, 0, NULL, NULL); \
+                        backtrace_print(state_, 0, stderr);                   \
+                        __assert_fail(#x, __FILE__, __LINE__, __func__);      \
+                }                                                             \
+        } while (0)
+#else
+#include <assert.h>
+#endif
+
+#endif /* ASSERT_H_ */
diff --git a/src/raft/byte.c b/src/raft/byte.c
new file mode 100644
index 000000000..3fcd79ee8
--- /dev/null
+++ b/src/raft/byte.c
@@ -0,0 +1,374 @@
+#include "byte.h"
+
+/* Taken from https://github.com/gcc-mirror/gcc/blob/master/libiberty/crc32.c */
+static const unsigned byteCrcTable[] = {
+    0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b,
+    0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
+    0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 0x4c11db70, 0x48d0c6c7,
+    0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
+    0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3,
+    0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
+    0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef,
+    0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
+    0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb,
+    0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
+    0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0,
+    0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
+    0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4,
+    0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
+    0x6b93dddb, 0x6f52c06c,
0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, + 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, + 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, + 0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, + 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050, + 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, + 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, + 0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, + 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, + 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, + 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, + 0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, + 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9, + 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b, + 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, + 0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, + 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71, + 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, + 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, + 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, + 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, + 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, + 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, + 0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, + 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676, + 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, + 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, + 0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, + 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4}; + +unsigned byteCrc32(const void *buf, const size_t size, const unsigned init) +{ + unsigned crc = init; + uint8_t *cursor = (uint8_t *)buf; + size_t count = size; + + while (count--) { + crc = (crc << 8) ^ byteCrcTable[((crc >> 24) ^ *cursor) & 255]; + cursor++; + } + return crc; +} + +/* ================ sha1.c ================ */ +/* +SHA-1 in C +By Steve Reid +100% Public Domain + +Test Vectors (from FIPS PUB 180-1) +"abc" + A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D +"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" + 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 +A million repetitions of "a" + 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F +*/ + +/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */ +/* #define SHA1HANDSOFF * Copies data before messing with it. 
+ */
+
+#define SHA1HANDSOFF
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h> /* for u_int*_t */
+#if defined(__sun)
+#include "solarisfixes.h"
+#endif
+
+#ifndef BYTE_ORDER
+#if (BSD >= 199103)
+#include <machine/endian.h>
+#else
+#if defined(linux) || defined(__linux__)
+#include <endian.h>
+#else
+#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */
+#define BIG_ENDIAN 4321    /* most-significant byte first (IBM, net) */
+#define PDP_ENDIAN 3412    /* LSB first in word, MSW first in long (pdp)*/
+
+#if defined(vax) || defined(ns32000) || defined(sun386) ||           \
+    defined(__i386__) || defined(MIPSEL) || defined(_MIPSEL) ||      \
+    defined(BIT_ZERO_ON_RIGHT) || defined(__alpha__) || defined(__alpha)
+#define BYTE_ORDER LITTLE_ENDIAN
+#endif
+
+#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) ||     \
+    defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \
+    defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) ||\
+    defined(apollo) || defined(__convex__) || defined(_CRAY) ||               \
+    defined(__hppa) || defined(__hp9000) || defined(__hp9000s300) ||          \
+    defined(__hp9000s700) || defined(BIT_ZERO_ON_LEFT) || defined(m68k) ||    \
+    defined(__sparc)
+#define BYTE_ORDER BIG_ENDIAN
+#endif
+#endif /* linux */
+#endif /* BSD */
+#endif /* BYTE_ORDER */
+
+#if defined(__BYTE_ORDER) && !defined(BYTE_ORDER)
+#if (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define BYTE_ORDER LITTLE_ENDIAN
+#else
+#define BYTE_ORDER BIG_ENDIAN
+#endif
+#endif
+
+#if !defined(BYTE_ORDER) ||                                       \
+    (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN &&   \
+     BYTE_ORDER != PDP_ENDIAN)
+/* you must determine what the correct bit order is for
+ * your compiler - the next line is an intentional error
+ * which will force your compiles to bomb until you fix
+ * the above macros.
+ */
+#error "Undefined or invalid BYTE_ORDER"
+#endif
+
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define blk0(i)                                              \
+        (block->l[i] = (rol(block->l[i], 24) & 0xFF00FF00) | \
+                       (rol(block->l[i], 8) & 0x00FF00FF))
+#elif BYTE_ORDER == BIG_ENDIAN
+#define blk0(i) block->l[i]
+#else
+#error "Endianness not defined!"
+#endif
+#define blk(i)                                                      \
+        (block->l[i & 15] =                                         \
+             rol(block->l[(i + 13) & 15] ^ block->l[(i + 8) & 15] ^ \
+                     block->l[(i + 2) & 15] ^ block->l[i & 15],     \
+                 1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v, w, x, y, z, i)                                         \
+        z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + rol(v, 5); \
+        w = rol(w, 30);
+#define R1(v, w, x, y, z, i)                                        \
+        z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \
+        w = rol(w, 30);
+#define R2(v, w, x, y, z, i)                                \
+        z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \
+        w = rol(w, 30);
+#define R3(v, w, x, y, z, i)                                              \
+        z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \
+        w = rol(w, 30);
+#define R4(v, w, x, y, z, i)                                \
+        z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \
+        w = rol(w, 30);
+
+static void byteSha1Transform(uint32_t state[5], const uint8_t buffer[64])
+{
+        uint32_t a, b, c, d, e;
+        typedef union {
+                uint8_t c[64];
+                uint32_t l[16];
+        } CHAR64LONG16;
+#ifdef SHA1HANDSOFF
+        CHAR64LONG16 block[1]; /* use array to appear as a pointer */
+        memcpy(block, buffer, 64);
+#else
+        /* The following had better never be used because it causes the
+         * pointer-to-const buffer to be cast into a pointer to non-const.
+         * And the result is written through.
I threw a "const" in, hoping + * this will cause a diagnostic. + */ + CHAR64LONG16 *block = (const CHAR64LONG16 *)buffer; +#endif + /* Copy context->state[] to working vars */ + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + /* 4 rounds of 20 operations each. Loop unrolled. */ + R0(a, b, c, d, e, 0); + R0(e, a, b, c, d, 1); + R0(d, e, a, b, c, 2); + R0(c, d, e, a, b, 3); + R0(b, c, d, e, a, 4); + R0(a, b, c, d, e, 5); + R0(e, a, b, c, d, 6); + R0(d, e, a, b, c, 7); + R0(c, d, e, a, b, 8); + R0(b, c, d, e, a, 9); + R0(a, b, c, d, e, 10); + R0(e, a, b, c, d, 11); + R0(d, e, a, b, c, 12); + R0(c, d, e, a, b, 13); + R0(b, c, d, e, a, 14); + R0(a, b, c, d, e, 15); + R1(e, a, b, c, d, 16); + R1(d, e, a, b, c, 17); + R1(c, d, e, a, b, 18); + R1(b, c, d, e, a, 19); + R2(a, b, c, d, e, 20); + R2(e, a, b, c, d, 21); + R2(d, e, a, b, c, 22); + R2(c, d, e, a, b, 23); + R2(b, c, d, e, a, 24); + R2(a, b, c, d, e, 25); + R2(e, a, b, c, d, 26); + R2(d, e, a, b, c, 27); + R2(c, d, e, a, b, 28); + R2(b, c, d, e, a, 29); + R2(a, b, c, d, e, 30); + R2(e, a, b, c, d, 31); + R2(d, e, a, b, c, 32); + R2(c, d, e, a, b, 33); + R2(b, c, d, e, a, 34); + R2(a, b, c, d, e, 35); + R2(e, a, b, c, d, 36); + R2(d, e, a, b, c, 37); + R2(c, d, e, a, b, 38); + R2(b, c, d, e, a, 39); + R3(a, b, c, d, e, 40); + R3(e, a, b, c, d, 41); + R3(d, e, a, b, c, 42); + R3(c, d, e, a, b, 43); + R3(b, c, d, e, a, 44); + R3(a, b, c, d, e, 45); + R3(e, a, b, c, d, 46); + R3(d, e, a, b, c, 47); + R3(c, d, e, a, b, 48); + R3(b, c, d, e, a, 49); + R3(a, b, c, d, e, 50); + R3(e, a, b, c, d, 51); + R3(d, e, a, b, c, 52); + R3(c, d, e, a, b, 53); + R3(b, c, d, e, a, 54); + R3(a, b, c, d, e, 55); + R3(e, a, b, c, d, 56); + R3(d, e, a, b, c, 57); + R3(c, d, e, a, b, 58); + R3(b, c, d, e, a, 59); + R4(a, b, c, d, e, 60); + R4(e, a, b, c, d, 61); + R4(d, e, a, b, c, 62); + R4(c, d, e, a, b, 63); + R4(b, c, d, e, a, 64); + R4(a, b, c, d, e, 65); + R4(e, a, b, c, d, 66); + R4(d, e, a, b, c, 67); + R4(c, d, e, a, b, 68); + R4(b, c, d, e, a, 69); + R4(a, b, c, d, e, 70); + R4(e, a, b, c, d, 71); + R4(d, e, a, b, c, 72); + R4(c, d, e, a, b, 73); + R4(b, c, d, e, a, 74); + R4(a, b, c, d, e, 75); + R4(e, a, b, c, d, 76); + R4(d, e, a, b, c, 77); + R4(c, d, e, a, b, 78); + R4(b, c, d, e, a, 79); + /* Add the working vars back into context.state[] */ + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + /* Wipe variables */ + a = b = c = d = e = 0; +#ifdef SHA1HANDSOFF + memset(block, '\0', sizeof(block)); +#endif +} + +void byteSha1Init(struct byteSha1 *s) +{ + /* SHA1 initialization constants */ + s->state[0] = 0x67452301; + s->state[1] = 0xEFCDAB89; + s->state[2] = 0x98BADCFE; + s->state[3] = 0x10325476; + s->state[4] = 0xC3D2E1F0; + s->count[0] = s->count[1] = 0; +} + +/* Run your data through this. */ +void __attribute__((noinline)) +byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len) +{ + uint32_t i; + uint32_t j; + + j = s->count[0]; + if ((s->count[0] += len << 3) < j) + s->count[1]++; + s->count[1] += (len >> 29); + j = (j >> 3) & 63; + if ((j + len) > 63) { + memcpy(&s->buffer[j], data, (i = 64 - j)); + byteSha1Transform(s->state, s->buffer); + for (; i + 63 < len; i += 64) { + byteSha1Transform(s->state, &data[i]); + } + j = 0; + } else + i = 0; + memcpy(&s->buffer[j], &data[i], len - i); +} + +/* Add padding and return the message digest. 
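+ *
+ * Illustrative usage of the three-step API (editor's sketch): hashing the
+ * string "abc" should produce the FIPS test vector quoted at the top of this
+ * section (a9993e36...):
+ *
+ *     struct byteSha1 s;
+ *     uint8_t digest[20];
+ *     byteSha1Init(&s);
+ *     byteSha1Update(&s, (const uint8_t *)"abc", 3);
+ *     byteSha1Digest(&s, digest);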
*/
+
+void byteSha1Digest(struct byteSha1 *s, uint8_t value[20])
+{
+        unsigned i;
+        uint8_t finalcount[8];
+        uint8_t c;
+
+#if 0 /* untested "improvement" by DHR */
+        /* Convert s->count to a sequence of bytes
+         * in finalcount. Second element first, but
+         * big-endian order within element.
+         * But we do it all backwards.
+         */
+        uint8_t *fcp = &finalcount[8];
+
+        for (i = 0; i < 2; i++)
+        {
+                uint32_t t = s->count[i];
+                int j;
+
+                for (j = 0; j < 4; t >>= 8, j++)
+                        *--fcp = (uint8_t)t;
+        }
+#else
+        for (i = 0; i < 8; i++) {
+                finalcount[i] = (uint8_t)((s->count[(i >= 4 ? 0 : 1)] >>
+                                           ((3 - (i & 3)) * 8)) &
+                                          255); /* Endian independent */
+        }
+#endif
+        c = 0200;
+        byteSha1Update(s, &c, 1);
+        while ((s->count[0] & 504) != 448) {
+                c = 0000;
+                byteSha1Update(s, &c, 1);
+        }
+        byteSha1Update(s, finalcount, 8); /* Should cause a SHA1Transform() */
+        for (i = 0; i < 20; i++) {
+                value[i] =
+                    (uint8_t)((s->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
+        }
+        /* Wipe variables */
+        memset(s, '\0', sizeof(*s));
+        memset(&finalcount, '\0', sizeof(finalcount));
+}
+
+/* ================ end of sha1.c ================ */
diff --git a/src/raft/byte.h b/src/raft/byte.h
new file mode 100644
index 000000000..ba213e914
--- /dev/null
+++ b/src/raft/byte.h
@@ -0,0 +1,182 @@
+/* Byte-level utilities. */
+
+#ifndef BYTE_H_
+#define BYTE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__cplusplus)
+#define BYTE__INLINE inline
+#else
+#if defined(__clang__)
+#define BYTE__INLINE static inline __attribute__((unused))
+#else
+#define BYTE__INLINE static inline
+#endif
+#endif
+
+/* Compile-time endianness detection (best effort). */
+#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
+    (defined(__ARMEL__) && (__ARMEL__ == 1))
+#define BYTE__LITTLE_ENDIAN
+#elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) && \
+    defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8
+#define RAFT__BIG_ENDIAN
+#endif
+
+/* Flip a 32-bit number to network byte order (little endian) */
+BYTE__INLINE uint32_t byteFlip32(uint32_t v)
+{
+#if defined(BYTE__LITTLE_ENDIAN)
+        return v;
+#elif defined(RAFT__BIG_ENDIAN)
+        return __builtin_bswap32(v);
+#else /* Unknown endianness */
+        union {
+                uint32_t u;
+                uint8_t v[4];
+        } s;
+
+        s.v[0] = (uint8_t)v;
+        s.v[1] = (uint8_t)(v >> 8);
+        s.v[2] = (uint8_t)(v >> 16);
+        s.v[3] = (uint8_t)(v >> 24);
+
+        return s.u;
+#endif
+}
+
+/* Flip a 64-bit number to network byte order (little endian) */
+BYTE__INLINE uint64_t byteFlip64(uint64_t v)
+{
+#if defined(BYTE__LITTLE_ENDIAN)
+        return v;
+#elif defined(RAFT__BIG_ENDIAN)
+        return __builtin_bswap64(v);
+#else
+        union {
+                uint64_t u;
+                uint8_t v[8];
+        } s;
+
+        s.v[0] = (uint8_t)v;
+        s.v[1] = (uint8_t)(v >> 8);
+        s.v[2] = (uint8_t)(v >> 16);
+        s.v[3] = (uint8_t)(v >> 24);
+        s.v[4] = (uint8_t)(v >> 32);
+        s.v[5] = (uint8_t)(v >> 40);
+        s.v[6] = (uint8_t)(v >> 48);
+        s.v[7] = (uint8_t)(v >> 56);
+
+        return s.u;
+#endif
+}
+
+BYTE__INLINE void bytePut8(void **cursor, uint8_t value)
+{
+        uint8_t **p = (uint8_t **)cursor;
+        **p = value;
+        *p += 1;
+}
+
+BYTE__INLINE void bytePut32(void **cursor, uint32_t value)
+{
+        unsigned i;
+        uint32_t flipped = byteFlip32(value);
+        for (i = 0; i < sizeof(uint32_t); i++) {
+                bytePut8(cursor, ((uint8_t *)(&flipped))[i]);
+        }
+}
+
+BYTE__INLINE void bytePut64(void **cursor, uint64_t value)
+{
+        unsigned i;
+        uint64_t flipped = byteFlip64(value);
+        for (i = 0; i < sizeof(uint64_t); i++) {
+                bytePut8(cursor, ((uint8_t *)(&flipped))[i]);
+        }
+}
+
+BYTE__INLINE void bytePutString(void
**cursor, const char *value) +{ + char **p = (char **)cursor; + strcpy(*p, value); + *p += strlen(value) + 1; +} + +BYTE__INLINE uint8_t byteGet8(const void **cursor) +{ + const uint8_t **p = (const uint8_t **)cursor; + uint8_t value = **p; + *p += 1; + return value; +} + +BYTE__INLINE uint32_t byteGet32(const void **cursor) +{ + uint32_t value = 0; + unsigned i; + for (i = 0; i < sizeof(uint32_t); i++) { + ((uint8_t *)(&value))[i] = byteGet8(cursor); + } + return byteFlip32(value); +} + +BYTE__INLINE uint64_t byteGet64(const void **cursor) +{ + uint64_t value = 0; + unsigned i; + for (i = 0; i < sizeof(uint64_t); i++) { + ((uint8_t *)(&value))[i] = byteGet8(cursor); + } + return byteFlip64(value); +} + +BYTE__INLINE const char *byteGetString(const void **cursor, size_t max_len) +{ + const char **p = (const char **)cursor; + const char *value = *p; + size_t len = 0; + while (len < max_len) { + if (*(*p + len) == 0) { + break; + } + len++; + } + if (len == max_len) { + return NULL; + } + *p += len + 1; + return value; +} + +/* Add padding to size if it's not a multiple of 8. */ +BYTE__INLINE size_t bytePad64(size_t size) +{ + size_t rest = size % sizeof(uint64_t); + + if (rest != 0) { + size += sizeof(uint64_t) - rest; + } + + return size; +} + +/* Calculate the CRC32 checksum of the given data buffer. */ +unsigned byteCrc32(const void *buf, size_t size, unsigned init); + +struct byteSha1 +{ + uint32_t state[5]; + uint32_t count[2]; + uint8_t buffer[64]; + uint8_t value[20]; +}; + +void byteSha1Init(struct byteSha1 *s); +void byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len); +void byteSha1Digest(struct byteSha1 *s, uint8_t value[20]); + +#endif /* BYTE_H_ */ diff --git a/src/raft/callbacks.c b/src/raft/callbacks.c new file mode 100644 index 000000000..5f58ee21a --- /dev/null +++ b/src/raft/callbacks.c @@ -0,0 +1,24 @@ +#include "callbacks.h" +#include "heap.h" + +int raftInitCallbacks(struct raft *r) +{ + r->callbacks = 0; + struct raft_callbacks *cbs = RaftHeapCalloc(1, sizeof(*cbs)); + if (cbs == NULL) { + return RAFT_NOMEM; + } + r->callbacks = (uint64_t)(uintptr_t)cbs; + return 0; +} + +void raftDestroyCallbacks(struct raft *r) +{ + RaftHeapFree((void *)(uintptr_t)r->callbacks); + r->callbacks = 0; +} + +struct raft_callbacks *raftGetCallbacks(struct raft *r) +{ + return (void *)(uintptr_t)r->callbacks; +} diff --git a/src/raft/callbacks.h b/src/raft/callbacks.h new file mode 100644 index 000000000..e756b3070 --- /dev/null +++ b/src/raft/callbacks.h @@ -0,0 +1,15 @@ +#ifndef CALLBACKS_H_ +#define CALLBACKS_H_ + +#include "../raft.h" + +struct raft_callbacks +{ + raft_state_cb state_cb; +}; + +int raftInitCallbacks(struct raft *r); +void raftDestroyCallbacks(struct raft *r); +struct raft_callbacks *raftGetCallbacks(struct raft *r); + +#endif diff --git a/src/raft/client.c b/src/raft/client.c new file mode 100644 index 000000000..cd88f1d2b --- /dev/null +++ b/src/raft/client.c @@ -0,0 +1,455 @@ +#include "../raft.h" +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "err.h" +#include "lifecycle.h" +#include "log.h" +#include "membership.h" +#include "progress.h" +#include "queue.h" +#include "replication.h" +#include "request.h" + +int raft_apply(struct raft *r, + struct raft_apply *req, + const struct raft_buffer bufs[], + const unsigned n, + raft_apply_cb cb) +{ + raft_index index; + int rv; + + tracef("raft_apply n %d", n); + + assert(r != NULL); + assert(bufs != NULL); + assert(n > 0); + + if (r->state != RAFT_LEADER || r->transfer 
!= NULL) { + rv = RAFT_NOTLEADER; + ErrMsgFromCode(r->errmsg, rv); + tracef("raft_apply not leader"); + goto err; + } + + /* Index of the first entry being appended. */ + index = logLastIndex(r->log) + 1; + tracef("%u commands starting at %lld", n, index); + req->type = RAFT_COMMAND; + req->index = index; + req->cb = cb; + + /* Append the new entries to the log. */ + rv = logAppendCommands(r->log, r->current_term, bufs, n); + if (rv != 0) { + goto err; + } + + lifecycleRequestStart(r, (struct request *)req); + + rv = replicationTrigger(r, index); + if (rv != 0) { + goto err_after_log_append; + } + + return 0; + +err_after_log_append: + logDiscard(r->log, index); + QUEUE_REMOVE(&req->queue); +err: + assert(rv != 0); + return rv; +} + +int raft_barrier(struct raft *r, struct raft_barrier *req, raft_barrier_cb cb) +{ + raft_index index; + struct raft_buffer buf; + int rv; + + if (r->state != RAFT_LEADER || r->transfer != NULL) { + rv = RAFT_NOTLEADER; + goto err; + } + + /* TODO: use a completely empty buffer */ + buf.len = 8; + buf.base = raft_malloc(buf.len); + + if (buf.base == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + /* Index of the barrier entry being appended. */ + index = logLastIndex(r->log) + 1; + tracef("barrier starting at %lld", index); + req->type = RAFT_BARRIER; + req->index = index; + req->cb = cb; + + rv = logAppend(r->log, r->current_term, RAFT_BARRIER, &buf, NULL); + if (rv != 0) { + goto err_after_buf_alloc; + } + + lifecycleRequestStart(r, (struct request *)req); + + rv = replicationTrigger(r, index); + if (rv != 0) { + goto err_after_log_append; + } + + return 0; + +err_after_log_append: + logDiscard(r->log, index); + QUEUE_REMOVE(&req->queue); +err_after_buf_alloc: + raft_free(buf.base); +err: + return rv; +} + +static int clientChangeConfiguration( + struct raft *r, + struct raft_change *req, + const struct raft_configuration *configuration) +{ + raft_index index; + raft_term term = r->current_term; + int rv; + + (void)req; + + /* Index of the entry being appended. */ + index = logLastIndex(r->log) + 1; + + /* Encode the new configuration and append it to the log. */ + rv = logAppendConfiguration(r->log, term, configuration); + if (rv != 0) { + goto err; + } + + if (configuration->n != r->configuration.n) { + rv = progressRebuildArray(r, configuration); + if (rv != 0) { + goto err; + } + } + + /* Update the current configuration if we've created a new object. */ + if (configuration != &r->configuration) { + raft_configuration_close(&r->configuration); + r->configuration = *configuration; + } + + /* Start writing the new log entry to disk and send it to the followers. + */ + rv = replicationTrigger(r, index); + if (rv != 0) { + /* TODO: restore the old next/match indexes and configuration. + */ + goto err_after_log_append; + } + + r->configuration_uncommitted_index = index; + + return 0; + +err_after_log_append: + logTruncate(r->log, index); + +err: + assert(rv != 0); + return rv; +} + +int raft_add(struct raft *r, + struct raft_change *req, + raft_id id, + const char *address, + raft_change_cb cb) +{ + struct raft_configuration configuration; + int rv; + + rv = membershipCanChangeConfiguration(r); + if (rv != 0) { + return rv; + } + + tracef("add server: id %llu, address %s", id, address); + + /* Make a copy of the current configuration, and add the new server to + * it. 
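+ * The new server is added with the RAFT_SPARE role; it can later be
+ * promoted to stand-by or voter via raft_assign() once it has caught up.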
*/ + rv = configurationCopy(&r->configuration, &configuration); + if (rv != 0) { + goto err; + } + + rv = raft_configuration_add(&configuration, id, address, RAFT_SPARE); + if (rv != 0) { + goto err_after_configuration_copy; + } + + req->cb = cb; + + rv = clientChangeConfiguration(r, req, &configuration); + if (rv != 0) { + goto err_after_configuration_copy; + } + + assert(r->leader_state.change == NULL); + r->leader_state.change = req; + + return 0; + +err_after_configuration_copy: + raft_configuration_close(&configuration); +err: + assert(rv != 0); + return rv; +} + +int raft_assign(struct raft *r, + struct raft_change *req, + raft_id id, + int role, + raft_change_cb cb) +{ + const struct raft_server *server; + unsigned server_index; + raft_index last_index; + int rv; + + tracef("raft_assign to id:%llu the role:%d", id, role); + if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) { + rv = RAFT_BADROLE; + ErrMsgFromCode(r->errmsg, rv); + return rv; + } + + rv = membershipCanChangeConfiguration(r); + if (rv != 0) { + return rv; + } + + server = configurationGet(&r->configuration, id); + if (server == NULL) { + rv = RAFT_NOTFOUND; + ErrMsgPrintf(r->errmsg, "no server has ID %llu", id); + goto err; + } + + /* Check if we have already the desired role. */ + if (server->role == role) { + const char *name; + rv = RAFT_BADROLE; + switch (role) { + case RAFT_VOTER: + name = "voter"; + break; + case RAFT_STANDBY: + name = "stand-by"; + break; + case RAFT_SPARE: + name = "spare"; + break; + default: + name = NULL; + assert(0); + break; + } + ErrMsgPrintf(r->errmsg, "server is already %s", name); + goto err; + } + + server_index = configurationIndexOf(&r->configuration, id); + assert(server_index < r->configuration.n); + + last_index = logLastIndex(r->log); + + req->cb = cb; + + assert(r->leader_state.change == NULL); + r->leader_state.change = req; + + /* If we are not promoting to the voter role or if the log of this + * server is already up-to-date, we can submit the configuration change + * immediately. */ + if (role != RAFT_VOTER || + progressMatchIndex(r, server_index) == last_index) { + int old_role = r->configuration.servers[server_index].role; + r->configuration.servers[server_index].role = role; + + rv = clientChangeConfiguration(r, req, &r->configuration); + if (rv != 0) { + tracef("clientChangeConfiguration failed %d", rv); + r->configuration.servers[server_index].role = old_role; + return rv; + } + + return 0; + } + + r->leader_state.promotee_id = server->id; + + /* Initialize the first catch-up round. */ + r->leader_state.round_number = 1; + r->leader_state.round_index = last_index; + r->leader_state.round_start = r->io->time(r->io); + + /* Immediately initiate an AppendEntries request. */ + rv = replicationProgress(r, server_index); + if (rv != 0 && rv != RAFT_NOCONNECTION) { + /* This error is not fatal. 
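+ * The promotion stays pending (promotee_id was set above) and the
+ * AppendEntries to this server should be retried by the normal
+ * heartbeat logic on subsequent ticks.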
*/ + tracef("failed to send append entries to server %llu: %s (%d)", + server->id, raft_strerror(rv), rv); + } + + return 0; + +err: + assert(rv != 0); + return rv; +} + +int raft_remove(struct raft *r, + struct raft_change *req, + raft_id id, + raft_change_cb cb) +{ + const struct raft_server *server; + struct raft_configuration configuration; + int rv; + + rv = membershipCanChangeConfiguration(r); + if (rv != 0) { + return rv; + } + + server = configurationGet(&r->configuration, id); + if (server == NULL) { + rv = RAFT_BADID; + goto err; + } + + tracef("remove server: id %llu", id); + + /* Make a copy of the current configuration, and remove the given server + * from it. */ + rv = configurationCopy(&r->configuration, &configuration); + if (rv != 0) { + goto err; + } + + rv = configurationRemove(&configuration, id); + if (rv != 0) { + goto err_after_configuration_copy; + } + + req->cb = cb; + + rv = clientChangeConfiguration(r, req, &configuration); + if (rv != 0) { + goto err_after_configuration_copy; + } + + assert(r->leader_state.change == NULL); + r->leader_state.change = req; + + return 0; + +err_after_configuration_copy: + raft_configuration_close(&configuration); + +err: + assert(rv != 0); + return rv; +} + +/* Find a suitable voting follower. */ +static raft_id clientSelectTransferee(struct raft *r) +{ + const struct raft_server *transferee = NULL; + unsigned i; + + for (i = 0; i < r->configuration.n; i++) { + const struct raft_server *server = &r->configuration.servers[i]; + if (server->id == r->id || server->role != RAFT_VOTER) { + continue; + } + transferee = server; + if (progressIsUpToDate(r, i)) { + break; + } + } + + if (transferee != NULL) { + return transferee->id; + } + + return 0; +} + +int raft_transfer(struct raft *r, + struct raft_transfer *req, + raft_id id, + raft_transfer_cb cb) +{ + const struct raft_server *server; + unsigned i; + int rv; + + tracef("transfer to %llu", id); + if (r->state != RAFT_LEADER || r->transfer != NULL) { + tracef("transfer error - state:%d", r->state); + rv = RAFT_NOTLEADER; + ErrMsgFromCode(r->errmsg, rv); + goto err; + } + + if (id == 0) { + id = clientSelectTransferee(r); + if (id == 0) { + rv = RAFT_NOTFOUND; + ErrMsgPrintf(r->errmsg, + "there's no other voting server"); + goto err; + } + } + + server = configurationGet(&r->configuration, id); + if (server == NULL || server->id == r->id || + server->role != RAFT_VOTER) { + rv = RAFT_BADID; + ErrMsgFromCode(r->errmsg, rv); + goto err; + } + + /* If this follower is up-to-date, we can send it the TimeoutNow message + * right away. */ + i = configurationIndexOf(&r->configuration, server->id); + assert(i < r->configuration.n); + + membershipLeadershipTransferInit(r, req, id, cb); + + if (progressPersistedIsUpToDate(r, i)) { + rv = membershipLeadershipTransferStart(r); + if (rv != 0) { + r->transfer = NULL; + goto err; + } + } + + return 0; + +err: + assert(rv != 0); + return rv; +} + +#undef tracef diff --git a/src/raft/compress.c b/src/raft/compress.c new file mode 100644 index 000000000..5297f4cd9 --- /dev/null +++ b/src/raft/compress.c @@ -0,0 +1,277 @@ +#include "compress.h" + +#ifdef LZ4_AVAILABLE +#include +#endif +#include +#include + +#include "assert.h" +#include "byte.h" +#include "err.h" + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? 
(a) : (b))
+#define MEGABYTE 1048576
+
+int Compress(struct raft_buffer bufs[],
+             unsigned n_bufs,
+             struct raft_buffer *compressed,
+             char *errmsg)
+{
+#ifndef LZ4_AVAILABLE
+        (void)bufs;
+        (void)n_bufs;
+        (void)compressed;
+        ErrMsgPrintf(errmsg, "LZ4 not available");
+        return RAFT_INVALID;
+#else
+        assert(bufs != NULL);
+        assert(n_bufs > 0);
+        assert(compressed != NULL);
+        assert(errmsg != NULL);
+
+        int rv = RAFT_IOERR;
+        size_t src_size = 0;
+        size_t dst_size = 0;
+        size_t src_offset = 0;
+        size_t dst_offset = 0;
+        size_t dst_size_needed = 0; /* Store minimal dst_size */
+        size_t ret = 0;             /* Return value of LZ4F_XXX functions */
+        compressed->base = NULL;
+        compressed->len = 0;
+
+        /* Determine total uncompressed size */
+        for (unsigned i = 0; i < n_bufs; ++i) {
+                src_size += bufs[i].len;
+        }
+
+        /* Work around a bug in liblz4 on bionic; in practice raft should only
+         * Compress non-0 length buffers, so this should be fine.
+         * https://github.com/lz4/lz4/issues/157
+         */
+        if (src_size == 0) {
+                ErrMsgPrintf(errmsg, "total size must be larger than 0");
+                rv = RAFT_INVALID;
+                goto err;
+        }
+
+        /* Set LZ4 preferences */
+        LZ4F_preferences_t lz4_pref;
+        memset(&lz4_pref, 0, sizeof(lz4_pref));
+        /* Detect data corruption when decompressing */
+        lz4_pref.frameInfo.contentChecksumFlag = 1;
+        /* For allocating a suitable buffer when decompressing */
+        lz4_pref.frameInfo.contentSize = src_size;
+
+        /* Context to track compression progress */
+        LZ4F_compressionContext_t ctx;
+        ret = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
+        if (LZ4F_isError(ret)) {
+                ErrMsgPrintf(errmsg, "LZ4F_createCompressionContext %s",
+                             LZ4F_getErrorName(ret));
+                rv = RAFT_NOMEM;
+                goto err;
+        }
+
+        /* Guesstimate of the eventual compressed size, mainly to avoid
+         * allocating a huge buffer, as `LZ4F_compressBound` calculates the
+         * worst-case scenario.
+         */
+        dst_size = LZ4F_compressBound(
+            max(MEGABYTE, (size_t)lz4_pref.frameInfo.contentSize / 10),
+            &lz4_pref);
+        dst_size += LZ4F_HEADER_SIZE_MAX_RAFT;
+        compressed->base = raft_malloc(dst_size);
+        if (compressed->base == NULL) {
+                rv = RAFT_NOMEM;
+                goto err_after_ctx_alloc;
+        }
+
+        /* Returns the size of the lz4 header, data should be written after the
+         * header */
+        dst_offset =
+            LZ4F_compressBegin(ctx, compressed->base, dst_size, &lz4_pref);
+        if (LZ4F_isError(dst_offset)) {
+                ErrMsgPrintf(errmsg, "LZ4F_compressBegin %s",
+                             LZ4F_getErrorName(dst_offset));
+                rv = RAFT_IOERR;
+                goto err_after_buff_alloc;
+        }
+
+        /* Compress all buffers */
+        for (unsigned i = 0; i < n_bufs; ++i) {
+                src_offset = 0;
+                while (src_offset < bufs[i].len) {
+                        /* Compress in chunks of maximum 1MB and check if there
+                         * is enough room in the dst buffer, if not realloc */
+                        src_size =
+                            min(bufs[i].len - src_offset, (size_t)MEGABYTE);
+                        dst_size_needed =
+                            LZ4F_compressBound(src_size, &lz4_pref);
+                        if (dst_size - dst_offset < dst_size_needed) {
+                                dst_size +=
+                                    max(dst_size_needed,
+                                        (size_t)lz4_pref.frameInfo.contentSize /
+                                            10);
+                                /* Use a temporary pointer so the original
+                                 * buffer can still be freed if realloc
+                                 * fails. */
+                                void *base =
+                                    raft_realloc(compressed->base, dst_size);
+                                if (base == NULL) {
+                                        rv = RAFT_NOMEM;
+                                        goto err_after_buff_alloc;
+                                }
+                                compressed->base = base;
+                        }
+                        /* There is guaranteed enough room in `dst` to perform
+                         * the compression */
+                        ret = LZ4F_compressUpdate(
+                            ctx, (char *)compressed->base + dst_offset,
+                            dst_size - dst_offset,
+                            (char *)bufs[i].base + src_offset, src_size, NULL);
+                        if (LZ4F_isError(ret)) {
+                                ErrMsgPrintf(errmsg, "LZ4F_compressUpdate %s",
+                                             LZ4F_getErrorName(ret));
+                                rv = RAFT_IOERR;
+                                goto err_after_buff_alloc;
+                        }
+                        dst_offset += ret;
+                        src_offset += src_size;
+                }
+        }
+
+        /* Make sure LZ4F_compressEnd has enough room to succeed */
+        dst_size_needed = LZ4F_compressBound(0, &lz4_pref);
+        if ((dst_size - dst_offset) < dst_size_needed) {
+                dst_size += dst_size_needed;
+                /* Same temporary-pointer pattern as above to avoid leaking
+                 * the buffer on a failed realloc. */
+                void *base = raft_realloc(compressed->base, dst_size);
+                if (base == NULL) {
+                        rv = RAFT_NOMEM;
+                        goto err_after_buff_alloc;
+                }
+                compressed->base = base;
+        }
+
+        /* Finalize compression */
+        ret = LZ4F_compressEnd(ctx, (char *)compressed->base + dst_offset,
+                               dst_size - dst_offset, NULL);
+        if (LZ4F_isError(ret)) {
+                ErrMsgPrintf(errmsg, "LZ4F_compressEnd %s",
+                             LZ4F_getErrorName(ret));
+                rv = RAFT_IOERR;
+                goto err_after_buff_alloc;
+        }
+
+        dst_offset += ret;
+        compressed->len = dst_offset;
+
+        LZ4F_freeCompressionContext(ctx);
+        return 0;
+
+err_after_buff_alloc:
+        raft_free(compressed->base);
+        compressed->base = NULL;
+err_after_ctx_alloc:
+        LZ4F_freeCompressionContext(ctx);
+err:
+        return rv;
+#endif /* LZ4_AVAILABLE */
+}
+
+int Decompress(struct raft_buffer buf,
+               struct raft_buffer *decompressed,
+               char *errmsg)
+{
+#ifndef LZ4_AVAILABLE
+        (void)buf;
+        (void)decompressed;
+        ErrMsgPrintf(errmsg, "LZ4 not available");
+        return RAFT_INVALID;
+#else
+        assert(decompressed != NULL);
+
+        int rv = RAFT_IOERR;
+        size_t src_offset = 0;
+        size_t dst_offset = 0;
+        size_t src_size = 0;
+        size_t dst_size = 0;
+        size_t ret = 0;
+
+        LZ4F_decompressionContext_t ctx;
+        if (LZ4F_isError(LZ4F_createDecompressionContext(&ctx, LZ4F_VERSION))) {
+                ErrMsgPrintf(errmsg, "LZ4F_createDecompressionContext");
+                rv = RAFT_NOMEM;
+                goto err;
+        }
+
+        src_size = buf.len;
+        LZ4F_frameInfo_t frameInfo = {0};
+        /* `src_size` will contain the size of the LZ4 Frame Header after the
+         * call, decompression must resume at that offset.
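+ * The header also carries contentSize, which Compress() stored in the
+ * frame, allowing the output buffer to be allocated up front below.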
*/ + ret = LZ4F_getFrameInfo(ctx, &frameInfo, buf.base, &src_size); + if (LZ4F_isError(ret)) { + ErrMsgPrintf(errmsg, "LZ4F_getFrameInfo %s", + LZ4F_getErrorName(ret)); + rv = RAFT_IOERR; + goto err_after_ctx_alloc; + } + src_offset = src_size; + + decompressed->base = raft_malloc((size_t)frameInfo.contentSize); + decompressed->len = (size_t)frameInfo.contentSize; + if (decompressed->base == NULL) { + rv = RAFT_NOMEM; + goto err_after_ctx_alloc; + } + + ret = 1; + while (ret != 0) { + src_size = buf.len - src_offset; + /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * The next line works around a bug in an older lz4 lib where + * the `size_t` dst_size parameter would overflow an `int`. + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ + dst_size = min(decompressed->len - dst_offset, (size_t)INT_MAX); + /* `dst_size` will contain the number of bytes written to + * decompressed->base, while `src_size` will contain the number + * of bytes consumed from buf.base */ + ret = LZ4F_decompress( + ctx, (char *)decompressed->base + dst_offset, &dst_size, + (char *)buf.base + src_offset, &src_size, NULL); + if (LZ4F_isError(ret)) { + ErrMsgPrintf(errmsg, "LZ4F_decompress %s", + LZ4F_getErrorName(ret)); + rv = RAFT_IOERR; + goto err_after_buff_alloc; + } + src_offset += src_size; + dst_offset += dst_size; + } + + if (LZ4F_freeDecompressionContext(ctx) != 0) { + raft_free(decompressed->base); + decompressed->base = NULL; + return RAFT_IOERR; + } + + return 0; + +err_after_buff_alloc: + raft_free(decompressed->base); + decompressed->base = NULL; +err_after_ctx_alloc: + LZ4F_freeDecompressionContext(ctx); +err: + return rv; +#endif /* LZ4_AVAILABLE */ +} + +bool IsCompressed(const void *data, size_t sz) +{ + if (data == NULL || sz < 4) { + return false; + } + const void *cursor = data; +#ifdef LZ4F_MAGICNUMBER +#define RAFT_LZ4F_MAGICNUMBER LZ4F_MAGICNUMBER +#else +#define RAFT_LZ4F_MAGICNUMBER 0x184D2204U +#endif + return byteGet32(&cursor) == RAFT_LZ4F_MAGICNUMBER; +} diff --git a/src/raft/compress.h b/src/raft/compress.h new file mode 100644 index 000000000..b36379fcc --- /dev/null +++ b/src/raft/compress.h @@ -0,0 +1,34 @@ +#ifndef COMPRESS_H_ +#define COMPRESS_H_ + +#include "../raft.h" + +#ifdef LZ4F_HEADER_SIZE_MAX +#define LZ4F_HEADER_SIZE_MAX_RAFT LZ4F_HEADER_SIZE_MAX +#else +#define LZ4F_HEADER_SIZE_MAX_RAFT 19UL +#endif + +/* + * Compresses the content of `bufs` into a newly allocated buffer that is + * returned to the caller through `compressed`. Returns a non-0 value upon + * failure. + */ +int Compress(struct raft_buffer bufs[], + unsigned n_bufs, + struct raft_buffer *compressed, + char *errmsg); + +/* + * Decompresses the content of `buf` into a newly allocated buffer that is + * returned to the caller through `decompressed`. Returns a non-0 value upon + * failure. + */ +int Decompress(struct raft_buffer buf, + struct raft_buffer *decompressed, + char *errmsg); + +/* Returns `true` if `data` is compressed, `false` otherwise. */ +bool IsCompressed(const void *data, size_t sz); + +#endif /* COMPRESS_H_ */ diff --git a/src/raft/configuration.c b/src/raft/configuration.c new file mode 100644 index 000000000..04ca13764 --- /dev/null +++ b/src/raft/configuration.c @@ -0,0 +1,401 @@ +#include "configuration.h" + +#include "../tracing.h" +#include "assert.h" +#include "byte.h" + +/* Current encoding format version. 
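+ * Format 1 layout (see configurationEncodeToBuf() below): one version
+ * byte, a 64-bit server count, then for each server a 64-bit ID, a
+ * NUL-terminated address string and a one-byte role code, with the total
+ * padded to a multiple of 8 bytes.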
*/ +#define ENCODING_FORMAT 1 + +void configurationInit(struct raft_configuration *c) +{ + c->servers = NULL; + c->n = 0; +} + +void configurationClose(struct raft_configuration *c) +{ + size_t i; + assert(c != NULL); + assert(c->n == 0 || c->servers != NULL); + for (i = 0; i < c->n; i++) { + raft_free(c->servers[i].address); + } + if (c->servers != NULL) { + raft_free(c->servers); + } +} + +unsigned configurationIndexOf(const struct raft_configuration *c, + const raft_id id) +{ + unsigned i; + assert(c != NULL); + for (i = 0; i < c->n; i++) { + if (c->servers[i].id == id) { + return i; + } + } + return c->n; +} + +unsigned configurationIndexOfVoter(const struct raft_configuration *c, + const raft_id id) +{ + unsigned i; + unsigned j = 0; + assert(c != NULL); + assert(id > 0); + + for (i = 0; i < c->n; i++) { + if (c->servers[i].id == id) { + if (c->servers[i].role == RAFT_VOTER) { + return j; + } + return c->n; + } + if (c->servers[i].role == RAFT_VOTER) { + j++; + } + } + + return c->n; +} + +const struct raft_server *configurationGet(const struct raft_configuration *c, + const raft_id id) +{ + size_t i; + assert(c != NULL); + assert(id > 0); + + /* Grab the index of the server with the given ID */ + i = configurationIndexOf(c, id); + + if (i == c->n) { + /* No server with matching ID. */ + return NULL; + } + assert(i < c->n); + + return &c->servers[i]; +} + +unsigned configurationVoterCount(const struct raft_configuration *c) +{ + unsigned i; + unsigned n = 0; + assert(c != NULL); + for (i = 0; i < c->n; i++) { + if (c->servers[i].role == RAFT_VOTER) { + n++; + } + } + return n; +} + +int configurationCopy(const struct raft_configuration *src, + struct raft_configuration *dst) +{ + size_t i; + int rv; + + configurationInit(dst); + for (i = 0; i < src->n; i++) { + struct raft_server *server = &src->servers[i]; + rv = configurationAdd(dst, server->id, server->address, + server->role); + if (rv != 0) { + goto err; + } + } + + return 0; + +err: + configurationClose(dst); + assert(rv == RAFT_NOMEM); + return rv; +} + +int configurationAdd(struct raft_configuration *c, + raft_id id, + const char *address, + int role) +{ + struct raft_server *servers; + struct raft_server *server; + char *address_copy; + size_t i; + int rv; + assert(c != NULL); + assert(id != 0); + + if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) { + rv = RAFT_BADROLE; + goto err; + } + + /* Check that neither the given id or address is already in use */ + for (i = 0; i < c->n; i++) { + server = &c->servers[i]; + if (server->id == id) { + rv = RAFT_DUPLICATEID; + goto err; + } + if (strcmp(server->address, address) == 0) { + rv = RAFT_DUPLICATEADDRESS; + goto err; + } + } + + /* Make a copy of the given address */ + address_copy = raft_malloc(strlen(address) + 1); + if (address_copy == NULL) { + rv = RAFT_NOMEM; + goto err; + } + strcpy(address_copy, address); + + /* Grow the servers array.. */ + servers = raft_realloc(c->servers, (c->n + 1) * sizeof *server); + if (servers == NULL) { + rv = RAFT_NOMEM; + goto err_after_address_copy; + } + c->servers = servers; + + /* Fill the newly allocated slot (the last one) with the given details. 
+ */ + server = &servers[c->n]; + server->id = id; + server->address = address_copy; + server->role = role; + + c->n++; + + return 0; + +err_after_address_copy: + raft_free(address_copy); +err: + assert(rv == RAFT_BADROLE || rv == RAFT_DUPLICATEID || + rv == RAFT_DUPLICATEADDRESS || rv == RAFT_NOMEM); + return rv; +} + +int configurationRemove(struct raft_configuration *c, const raft_id id) +{ + unsigned i; + unsigned j; + struct raft_server *servers; + int rv; + + assert(c != NULL); + + i = configurationIndexOf(c, id); + if (i == c->n) { + rv = RAFT_BADID; + goto err; + } + + assert(i < c->n); + + /* If this is the last server in the configuration, reset everything. */ + if (c->n - 1 == 0) { + assert(i == 0); + servers = NULL; + goto out; + } + + /* Create a new servers array. */ + servers = raft_calloc(c->n - 1, sizeof *servers); + if (servers == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + /* Copy the first part of the servers array into a new array, excluding + * the i'th server. */ + for (j = 0; j < i; j++) { + servers[j] = c->servers[j]; + } + + /* Copy the second part of the servers array into a new array. */ + for (j = i + 1; j < c->n; j++) { + servers[j - 1] = c->servers[j]; + } + +out: + /* Release the address of the server that was deleted. */ + raft_free(c->servers[i].address); + + /* Release the old servers array */ + raft_free(c->servers); + + c->servers = servers; + c->n--; + + return 0; + +err: + assert(rv == RAFT_BADID || rv == RAFT_NOMEM); + return rv; +} + +size_t configurationEncodedSize(const struct raft_configuration *c) +{ + size_t n = 0; + unsigned i; + + /* We need one byte for the encoding format version */ + n++; + + /* Then 8 bytes for number of servers. */ + n += sizeof(uint64_t); + + /* Then some space for each server. */ + for (i = 0; i < c->n; i++) { + struct raft_server *server = &c->servers[i]; + assert(server->address != NULL); + n += sizeof(uint64_t); /* Server ID */ + n += strlen(server->address) + 1; /* Address */ + n++; /* Voting flag */ + }; + + return bytePad64(n); +} + +void configurationEncodeToBuf(const struct raft_configuration *c, void *buf) +{ + void *cursor = buf; + unsigned i; + + /* Encoding format version */ + bytePut8(&cursor, ENCODING_FORMAT); + + /* Number of servers. */ + bytePut64(&cursor, c->n); + + for (i = 0; i < c->n; i++) { + struct raft_server *server = &c->servers[i]; + assert(server->address != NULL); + bytePut64(&cursor, server->id); + bytePutString(&cursor, server->address); + assert(server->role < 255); + bytePut8(&cursor, (uint8_t)server->role); + }; +} + +int configurationEncode(const struct raft_configuration *c, + struct raft_buffer *buf) +{ + int rv; + + assert(c != NULL); + assert(buf != NULL); + + /* The configuration can't be empty. */ + assert(c->n > 0); + + buf->len = configurationEncodedSize(c); + buf->base = raft_malloc(buf->len); + if (buf->base == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + configurationEncodeToBuf(c, buf->base); + + return 0; + +err: + assert(rv == RAFT_NOMEM); + return rv; +} + +int configurationDecode(const struct raft_buffer *buf, + struct raft_configuration *c) +{ + const void *cursor; + size_t i; + size_t n; + int rv; + + assert(c != NULL); + assert(buf != NULL); + + /* TODO: use 'if' instead of assert for checking buffer boundaries */ + assert(buf->len > 0); + + configurationInit(c); + + cursor = buf->base; + + /* Check the encoding format version */ + if (byteGet8(&cursor) != ENCODING_FORMAT) { + rv = RAFT_MALFORMED; + goto err; + } + + /* Read the number of servers. 
*/ + n = (size_t)byteGet64(&cursor); + + /* Decode the individual servers. */ + for (i = 0; i < n; i++) { + raft_id id; + const char *address; + int role; + + /* Server ID. */ + id = byteGet64(&cursor); + + /* Server Address. */ + address = byteGetString( + &cursor, buf->len - (size_t)((uint8_t *)cursor - + (uint8_t *)buf->base)); + if (address == NULL) { + rv = RAFT_MALFORMED; + goto err; + } + + /* Role code. */ + role = byteGet8(&cursor); + + rv = configurationAdd(c, id, address, role); + if (rv != 0) { + /* Only valid configurations should be ever be encoded, + * so in case configurationAdd() fails because of + * invalid data we return RAFT_MALFORMED. */ + if (rv != RAFT_NOMEM) { + rv = RAFT_MALFORMED; + } + goto err; + } + } + + return 0; + +err: + assert(rv == RAFT_MALFORMED || rv == RAFT_NOMEM); + configurationClose(c); + return rv; +} + +void configurationTrace(const struct raft *r, + struct raft_configuration *c, + const char *msg) +{ + (void)r; + tracef("%s", msg); + tracef("=== CONFIG START ==="); + unsigned i; + struct raft_server *s; + for (i = 0; i < c->n; i++) { + s = &c->servers[i]; + tracef("id:%llu address:%s role:%d", s->id, s->address, + s->role); + } + tracef("=== CONFIG END ==="); +} +#undef tracef diff --git a/src/raft/configuration.h b/src/raft/configuration.h new file mode 100644 index 000000000..dc1429c9b --- /dev/null +++ b/src/raft/configuration.h @@ -0,0 +1,131 @@ +/* Modify and inspect @raft_configuration objects. */ + +#ifndef CONFIGURATION_H_ +#define CONFIGURATION_H_ + +#include "../raft.h" + +/* Initialize an empty configuration. */ +void configurationInit(struct raft_configuration *c); + +/* Release all memory used by the given configuration. */ +void configurationClose(struct raft_configuration *c); + +/* Add a server to the given configuration. + * + * The given @address is copied and no reference to it is kept. In case of + * error, @c is left unchanged. + * + * Errors: + * + * RAFT_DUPLICATEID + * @c already has a server with the given id. + * + * RAFT_DUPLICATEADDRESS + * @c already has a server with the given @address. + * + * RAFT_BADROLE + * @role is not one of ROLE_STANDBY, ROLE_VOTER or ROLE_SPARE. + * + * RAFT_NOMEM + * A copy of @address could not me made or the @c->servers could not + * be extended + */ +int configurationAdd(struct raft_configuration *c, + raft_id id, + const char *address, + int role); + +/* Return the number of servers with the RAFT_VOTER role. */ +unsigned configurationVoterCount(const struct raft_configuration *c); + +/* Return the index of the server with the given ID (relative to the c->servers + * array). If there's no server with the given ID, return the number of + * servers. */ +unsigned configurationIndexOf(const struct raft_configuration *c, raft_id id); + +/* Return the index of the RAFT_VOTER server with the given ID (relative to the + * sub array of c->servers that has only voting servers). If there's no server + * with the given ID, or if it's not flagged as voting, return the number of + * servers. */ +unsigned configurationIndexOfVoter(const struct raft_configuration *c, + raft_id id); + +/* Get the server with the given ID, or #NULL if no matching server is found. */ +const struct raft_server *configurationGet(const struct raft_configuration *c, + raft_id id); + +/* Remove a server from a raft configuration. The given ID must match the one of + * an existing server in the configuration. + * + * In case of error @c is left unchanged. 
+ * + * Errors: + * + * RAFT_BADID + * @c does not contain any server with the given @id + * + * RAFT_NOMEM + * Memory to hold the new set of servers could not be allocated. + */ +int configurationRemove(struct raft_configuration *c, raft_id id); + +/* Deep copy @src to @dst. + * + * The configuration @src is assumed to be valid (i.e. each of its servers has a + * valid ID, address and role). + * + * The @dst configuration object must be uninitialized or empty. + * + * In case of error, both @src and @dst are left unchanged. + * + * Errors: + * + * RAFT_NOMEM + * Memory to copy all the servers could not be allocated. + */ +int configurationCopy(const struct raft_configuration *src, + struct raft_configuration *dst); + +/* Number of bytes needed to encode the given configuration object. */ +size_t configurationEncodedSize(const struct raft_configuration *c); + +/* Encode the given configuration object to the given pre-allocated buffer, + * which is assumed to be at least configurationEncodedSize(c) bytes. */ +void configurationEncodeToBuf(const struct raft_configuration *c, void *buf); + +/* Encode the given configuration object. The memory of the returned buffer is + * allocated using raft_malloc(), and client code is responsible for releasing + * it when no longer needed. + * + * Errors: + * + * RAFT_NOMEM + * Memory for the encoded buffer could not be allocated. + */ +int configurationEncode(const struct raft_configuration *c, + struct raft_buffer *buf); + +/* Populate a configuration object by decoding the given serialized payload. + * + * The @c configuration object must be uninitialized or empty. + * + * In case of error, @c will be left empty. + * + * Errors: + * + * RAFT_MALFORMED + * The given buffer does not contain a valid encoded configuration. + * + * RAFT_NOMEM + * Memory to populate the given configuration could not be allocated. + */ +int configurationDecode(const struct raft_buffer *buf, + struct raft_configuration *c); + +/* Output the configuration to the raft tracer */ +void configurationTrace(const struct raft *r, + struct raft_configuration *c, + const char *msg); + +#endif /* CONFIGURATION_H_ */ diff --git a/src/raft/convert.c b/src/raft/convert.c new file mode 100644 index 000000000..1c4d52d25 --- /dev/null +++ b/src/raft/convert.c @@ -0,0 +1,271 @@ +#include "convert.h" + +#include "../raft.h" +#include "../tracing.h" +#include "assert.h" +#include "callbacks.h" +#include "configuration.h" +#include "election.h" +#include "log.h" +#include "membership.h" +#include "progress.h" +#include "queue.h" +#include "replication.h" +#include "request.h" + +/* Convenience for setting a new state value and asserting that the transition + * is valid. */ +static void convertSetState(struct raft *r, unsigned short new_state) +{ + /* Check that the transition is legal, see Figure 3.3. Note that with + * respect to the paper we have an additional "unavailable" state, which + * is the initial or final state. 
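+ * The legal transitions are: unavailable -> follower, follower ->
+ * candidate, candidate -> follower or leader, leader -> follower, and
+ * follower, candidate or leader -> unavailable.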
*/ + unsigned short old_state = r->state; + tracef("old_state:%u new_state:%u", old_state, new_state); + assert((r->state == RAFT_UNAVAILABLE && new_state == RAFT_FOLLOWER) || + (r->state == RAFT_FOLLOWER && new_state == RAFT_CANDIDATE) || + (r->state == RAFT_CANDIDATE && new_state == RAFT_FOLLOWER) || + (r->state == RAFT_CANDIDATE && new_state == RAFT_LEADER) || + (r->state == RAFT_LEADER && new_state == RAFT_FOLLOWER) || + (r->state == RAFT_FOLLOWER && new_state == RAFT_UNAVAILABLE) || + (r->state == RAFT_CANDIDATE && new_state == RAFT_UNAVAILABLE) || + (r->state == RAFT_LEADER && new_state == RAFT_UNAVAILABLE)); + r->state = new_state; + if (r->state == RAFT_LEADER) { + r->leader_state.voter_contacts = 1; + } + + struct raft_callbacks *cbs = raftGetCallbacks(r); + if (cbs != NULL && cbs->state_cb != NULL) { + cbs->state_cb(r, old_state, new_state); + } +} + +/* Clear follower state. */ +static void convertClearFollower(struct raft *r) +{ + tracef("clear follower state"); + r->follower_state.current_leader.id = 0; + if (r->follower_state.current_leader.address != NULL) { + raft_free(r->follower_state.current_leader.address); + } + r->follower_state.current_leader.address = NULL; +} + +/* Clear candidate state. */ +static void convertClearCandidate(struct raft *r) +{ + tracef("clear candidate state"); + if (r->candidate_state.votes != NULL) { + raft_free(r->candidate_state.votes); + r->candidate_state.votes = NULL; + } +} + +static void convertFailApply(struct raft_apply *req) +{ + if (req != NULL && req->cb != NULL) { + req->cb(req, RAFT_LEADERSHIPLOST, NULL); + } +} + +static void convertFailBarrier(struct raft_barrier *req) +{ + if (req != NULL && req->cb != NULL) { + req->cb(req, RAFT_LEADERSHIPLOST); + } +} + +static void convertFailChange(struct raft_change *req) +{ + if (req != NULL && req->cb != NULL) { + req->cb(req, RAFT_LEADERSHIPLOST); + } +} + +/* Clear leader state. */ +static void convertClearLeader(struct raft *r) +{ + tracef("clear leader state"); + if (r->leader_state.progress != NULL) { + raft_free(r->leader_state.progress); + r->leader_state.progress = NULL; + } + + /* Fail all outstanding requests */ + while (!QUEUE_IS_EMPTY(&r->leader_state.requests)) { + struct request *req; + queue *head; + head = QUEUE_HEAD(&r->leader_state.requests); + QUEUE_REMOVE(head); + req = QUEUE_DATA(head, struct request, queue); + assert(req->type == RAFT_COMMAND || req->type == RAFT_BARRIER); + switch (req->type) { + case RAFT_COMMAND: + convertFailApply((struct raft_apply *)req); + break; + case RAFT_BARRIER: + convertFailBarrier((struct raft_barrier *)req); + break; + }; + } + + /* Fail any promote request that is still outstanding because the server + * is still catching up and no entry was submitted. */ + if (r->leader_state.change != NULL) { + convertFailChange(r->leader_state.change); + r->leader_state.change = NULL; + } +} + +/* Clear the current state */ +static void convertClear(struct raft *r) +{ + assert(r->state == RAFT_UNAVAILABLE || r->state == RAFT_FOLLOWER || + r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER); + switch (r->state) { + case RAFT_FOLLOWER: + convertClearFollower(r); + break; + case RAFT_CANDIDATE: + convertClearCandidate(r); + break; + case RAFT_LEADER: + convertClearLeader(r); + break; + } +} + +void convertToFollower(struct raft *r) +{ + convertClear(r); + convertSetState(r, RAFT_FOLLOWER); + + /* Reset election timer. 
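+ * A fresh randomized timeout is drawn, so servers stepping down at the
+ * same time are unlikely to start competing elections together.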
*/ + electionResetTimer(r); + + r->follower_state.current_leader.id = 0; + r->follower_state.current_leader.address = NULL; + r->follower_state.append_in_flight_count = 0; +} + +int convertToCandidate(struct raft *r, bool disrupt_leader) +{ + const struct raft_server *server; + size_t n_voters = configurationVoterCount(&r->configuration); + int rv; + + (void)server; /* Only used for assertions. */ + + convertClear(r); + convertSetState(r, RAFT_CANDIDATE); + + /* Allocate the votes array. */ + r->candidate_state.votes = raft_malloc(n_voters * sizeof(bool)); + if (r->candidate_state.votes == NULL) { + return RAFT_NOMEM; + } + r->candidate_state.disrupt_leader = disrupt_leader; + r->candidate_state.in_pre_vote = disrupt_leader ? false : r->pre_vote; + + /* Fast-forward to leader if we're the only voting server in the + * configuration. */ + server = configurationGet(&r->configuration, r->id); + assert(server != NULL); + assert(server->role == RAFT_VOTER); + + if (n_voters == 1) { + tracef("self elect and convert to leader"); + return convertToLeader(r); + } + + /* Start a new election round */ + rv = electionStart(r); + if (rv != 0) { + r->state = RAFT_FOLLOWER; + raft_free(r->candidate_state.votes); + return rv; + } + + return 0; +} + +void convertInitialBarrierCb(struct raft_barrier *req, int status) +{ + (void)status; + raft_free(req); +} + +int convertToLeader(struct raft *r) +{ + int rv; + + tracef("become leader for term %llu", r->current_term); + + convertClear(r); + convertSetState(r, RAFT_LEADER); + + /* Reset timers */ + r->election_timer_start = r->io->time(r->io); + + /* Reset apply requests queue */ + QUEUE_INIT(&r->leader_state.requests); + + /* Allocate and initialize the progress array. */ + rv = progressBuildArray(r); + if (rv != 0) { + return rv; + } + + r->leader_state.change = NULL; + + /* Reset promotion state. */ + r->leader_state.promotee_id = 0; + r->leader_state.round_number = 0; + r->leader_state.round_index = 0; + r->leader_state.round_start = 0; + + /* By definition, all entries until the last_stored entry will be + * committed if we are the only voter around. */ + size_t n_voters = configurationVoterCount(&r->configuration); + if (n_voters == 1 && (r->last_stored > r->commit_index)) { + tracef("apply log entries after self election %llu %llu", + r->last_stored, r->commit_index); + r->commit_index = r->last_stored; + rv = replicationApply(r); + } else if (n_voters > 1) { + /* Raft Dissertation, paragraph 6.4: + * The Leader Completeness Property guarantees that a leader has + * all committed entries, but at the start of its term, it may + * not know which those are. To find out, it needs to commit an + * entry from its term. Raft handles this by having each leader + * commit a blank no-op entry into the log at the start of its + * term. */ + struct raft_barrier *req = raft_malloc(sizeof(*req)); + if (req == NULL) { + return RAFT_NOMEM; + } + rv = raft_barrier(r, req, convertInitialBarrierCb); + if (rv != 0) { + tracef( + "failed to send no-op barrier entry after leader " + "conversion: " + "%d", + rv); + } + } + + return rv; +} + +void convertToUnavailable(struct raft *r) +{ + /* Abort any pending leadership transfer request. 
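+ * A transfer cannot be left pending once this server stops
+ * participating, so it is closed before the state switch.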
*/ + if (r->transfer != NULL) { + membershipLeadershipTransferClose(r); + } + convertClear(r); + convertSetState(r, RAFT_UNAVAILABLE); +} + +#undef tracef diff --git a/src/raft/convert.h b/src/raft/convert.h new file mode 100644 index 000000000..face1468e --- /dev/null +++ b/src/raft/convert.h @@ -0,0 +1,52 @@ +/* Convert from one state to another. */ + +#ifndef CONVERT_H_ +#define CONVERT_H_ + +#include "../raft.h" + +/* Convert from unavailable, or candidate or leader to follower. + * + * From Figure 3.1: + * + * If election timeout elapses without receiving AppendEntries RPC from + * current leader or granting vote to candidate: convert to candidate. + * + * The above implies that we need to reset the election timer when converting to + * follower. */ +void convertToFollower(struct raft *r); + +/* Convert from follower to candidate, starting a new election. + * + * From Figure 3.1: + * + * On conversion to candidate, start election + * + * If the disrupt_leader flag is true, the server will set the disrupt leader + * flag of the RequestVote messages it sends. */ +int convertToCandidate(struct raft *r, bool disrupt_leader); + +/* Convert from candidate to leader. + * + * From Figure 3.1: + * + * Upon election: send initial empty AppendEntries RPC (heartbeat) to each + * server. + * + * From Section 3.4: + * + * Once a candidate wins an election, it becomes leader. It then sends + * heartbeat messages to all of the other servers to establish its authority + * and prevent new elections. + * + * From Section 3.3: + * + * The leader maintains a nextIndex for each follower, which is the index + * of the next log entry the leader will send to that follower. When a + * leader first comes to power, it initializes all nextIndex values to the + * index just after the last one in its log. */ +int convertToLeader(struct raft *r); + +void convertToUnavailable(struct raft *r); + +#endif /* CONVERT_H_ */ diff --git a/src/raft/election.c b/src/raft/election.c new file mode 100644 index 000000000..ecdcd20f0 --- /dev/null +++ b/src/raft/election.c @@ -0,0 +1,327 @@ +#include "election.h" + +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "heap.h" +#include "log.h" + +/* Common fields between follower and candidate state. + * + * The follower_state and candidate_state structs in raft.h must be kept + * consistent with this definition. */ +struct followerOrCandidateState +{ + unsigned randomized_election_timeout; +}; + +/* Return a pointer to either the follower or candidate state. 
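+ * Both structs start with the randomized_election_timeout field (see the
+ * struct comment above), which is what makes the casts below safe.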
*/ +struct followerOrCandidateState *getFollowerOrCandidateState(struct raft *r) +{ + struct followerOrCandidateState *state; + assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); + if (r->state == RAFT_FOLLOWER) { + state = (struct followerOrCandidateState *)&r->follower_state; + } else { + state = (struct followerOrCandidateState *)&r->candidate_state; + } + return state; +} + +void electionResetTimer(struct raft *r) +{ + struct followerOrCandidateState *state = getFollowerOrCandidateState(r); + unsigned timeout = (unsigned)r->io->random( + r->io, (int)r->election_timeout, 2 * (int)r->election_timeout); + assert(timeout >= r->election_timeout); + assert(timeout <= r->election_timeout * 2); + state->randomized_election_timeout = timeout; + r->election_timer_start = r->io->time(r->io); +} + +bool electionTimerExpired(struct raft *r) +{ + struct followerOrCandidateState *state = getFollowerOrCandidateState(r); + raft_time now = r->io->time(r->io); + return now - r->election_timer_start >= + state->randomized_election_timeout; +} + +static void sendRequestVoteCb(struct raft_io_send *send, int status) +{ + (void)status; + RaftHeapFree(send); +} + +/* Send a RequestVote RPC to the given server. */ +static int electionSend(struct raft *r, const struct raft_server *server) +{ + struct raft_message message; + struct raft_io_send *send; + raft_term term; + int rv; + assert(server->id != r->id); + assert(server->id != 0); + + /* If we are in the pre-vote phase, we indicate our future term in the + * request. */ + term = r->current_term; + if (r->candidate_state.in_pre_vote) { + term++; + } + + /* Fill the RequestVote message. + * + * Note that we set last_log_index and last_log_term to the index and + * term of the last persisted entry, to the last entry in our in-memory + * log cache, because we must advertise only log entries that can't be + * lost at restart. + * + * Also note that, for a similar reason, we apply pending configuration + * changes only once they are persisted. When running an election we + * then use only persisted information, which is safe (while using + * unpersisted information for the log and persisted information for the + * configuration or viceversa would lead to inconsistencies and + * violations of Raft invariants). + */ + message.type = RAFT_IO_REQUEST_VOTE; + message.request_vote.term = term; + message.request_vote.candidate_id = r->id; + message.request_vote.last_log_index = r->last_stored; + message.request_vote.last_log_term = logTermOf(r->log, r->last_stored); + message.request_vote.disrupt_leader = r->candidate_state.disrupt_leader; + message.request_vote.pre_vote = r->candidate_state.in_pre_vote; + message.server_id = server->id; + message.server_address = server->address; + + send = RaftHeapMalloc(sizeof *send); + if (send == NULL) { + return RAFT_NOMEM; + } + + send->data = r; + + rv = r->io->send(r->io, send, &message, sendRequestVoteCb); + if (rv != 0) { + RaftHeapFree(send); + return rv; + } + + return 0; +} + +int electionStart(struct raft *r) +{ + raft_term term; + size_t n_voters; + size_t voting_index; + size_t i; + int rv; + assert(r->state == RAFT_CANDIDATE); + + n_voters = configurationVoterCount(&r->configuration); + voting_index = configurationIndexOfVoter(&r->configuration, r->id); + + /* This function should not be invoked if we are not a voting server, + * hence voting_index must be lower than the number of servers in the + * configuration (meaning that we are a voting server). 
*/ + assert(voting_index < r->configuration.n); + + /* Coherence check that configurationVoterCount and + * configurationIndexOfVoter have returned something that makes sense. + */ + assert(n_voters <= r->configuration.n); + assert(voting_index < n_voters); + + /* During pre-vote we don't increment our term, or reset our vote. + * Resetting our vote could lead to double-voting if we were to receive + * a RequestVote RPC during our Candidate state while we already voted + * for a server during the term. */ + if (!r->candidate_state.in_pre_vote) { + /* Increment current term */ + term = r->current_term + 1; + rv = r->io->set_term(r->io, term); + if (rv != 0) { + tracef("set_term failed %d", rv); + goto err; + } + tracef("beginning of term %llu", term); + + /* Vote for self */ + rv = r->io->set_vote(r->io, r->id); + if (rv != 0) { + tracef("set_vote self failed %d", rv); + goto err; + } + + /* Update our cache too. */ + r->current_term = term; + r->voted_for = r->id; + } + + /* Reset election timer. */ + electionResetTimer(r); + + assert(r->candidate_state.votes != NULL); + + /* Initialize the votes array and send vote requests. */ + for (i = 0; i < n_voters; i++) { + if (i == voting_index) { + r->candidate_state.votes[i] = + true; /* We vote for ourselves */ + } else { + r->candidate_state.votes[i] = false; + } + } + for (i = 0; i < r->configuration.n; i++) { + const struct raft_server *server = &r->configuration.servers[i]; + if (server->id == r->id || server->role != RAFT_VOTER) { + continue; + } + rv = electionSend(r, server); + if (rv != 0) { + /* This is not a critical failure, let's just log it. */ + tracef("failed to send vote request to server %llu: %s", + server->id, raft_strerror(rv)); + } + } + + return 0; + +err: + assert(rv != 0); + return rv; +} + +int electionVote(struct raft *r, + const struct raft_request_vote *args, + bool *granted) +{ + const struct raft_server *local_server; + raft_index local_last_index; + raft_term local_last_term; + bool is_transferee; /* Requester is the target of a leadership transfer + */ + int rv; + + assert(r != NULL); + assert(args != NULL); + assert(granted != NULL); + + local_server = configurationGet(&r->configuration, r->id); + + *granted = false; + + if (local_server == NULL || local_server->role != RAFT_VOTER) { + tracef("local server is not voting -> not granting vote"); + return 0; + } + + is_transferee = + r->transfer != NULL && r->transfer->id == args->candidate_id; + if (!args->pre_vote && r->voted_for != 0 && + r->voted_for != args->candidate_id && !is_transferee) { + tracef("local server already voted -> not granting vote"); + return 0; + } + + /* Raft Dissertation 9.6: + * > In the Pre-Vote algorithm, a candidate + * > only increments its term if it first learns from a majority of the + * > cluster that they would be willing + * > to grant the candidate their votes (if the candidate's log is + * > sufficiently up-to-date, and the voters + * > have not received heartbeats from a valid leader for at least a + * baseline > election timeout) Arriving here means that in a pre-vote + * phase, we will cast our vote if the candidate's log is sufficiently + * up-to-date, no matter what the candidate's term is. We have already + * checked if we currently have a leader upon reception of the + * RequestVote RPC, meaning the 2 conditions will be satisfied if the + * candidate's log is up-to-date. + * */ + local_last_index = logLastIndex(r->log); + + /* Our log is definitely not more up-to-date if it's empty! 
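+ * Raft's up-to-date check follows: first compare the term of the last
+ * entry, and on equal terms compare log length.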
*/ + if (local_last_index == 0) { + tracef("local log is empty -> granting vote"); + goto grant_vote; + } + + local_last_term = logLastTerm(r->log); + + if (args->last_log_term < local_last_term) { + /* The requesting server has last entry's log term lower than + * ours. */ + tracef( + "local last entry %llu has term %llu higher than %llu -> " + "not " + "granting", + local_last_index, local_last_term, args->last_log_term); + return 0; + } + + if (args->last_log_term > local_last_term) { + /* The requesting server has a more up-to-date log. */ + tracef( + "remote last entry %llu has term %llu higher than %llu -> " + "granting vote", + args->last_log_index, args->last_log_term, local_last_term); + goto grant_vote; + } + + /* The term of the last log entry is the same, so let's compare the + * length of the log. */ + assert(args->last_log_term == local_last_term); + + if (local_last_index <= args->last_log_index) { + /* Our log is shorter or equal to the one of the requester. */ + tracef( + "remote log equal or longer than local -> granting vote"); + goto grant_vote; + } + + tracef("remote log shorter than local -> not granting vote"); + + return 0; + +grant_vote: + if (!args->pre_vote) { + rv = r->io->set_vote(r->io, args->candidate_id); + if (rv != 0) { + tracef("set_vote failed %d", rv); + return rv; + } + r->voted_for = args->candidate_id; + + /* Reset the election timer. */ + r->election_timer_start = r->io->time(r->io); + } + + tracef("vote granted to %llu", args->candidate_id); + *granted = true; + + return 0; +} + +bool electionTally(struct raft *r, size_t voter_index) +{ + size_t n_voters = configurationVoterCount(&r->configuration); + size_t votes = 0; + size_t i; + size_t half = n_voters / 2; + + assert(r->state == RAFT_CANDIDATE); + assert(r->candidate_state.votes != NULL); + + r->candidate_state.votes[voter_index] = true; + + for (i = 0; i < n_voters; i++) { + if (r->candidate_state.votes[i]) { + votes++; + } + } + + return votes >= half + 1; +} + +#undef tracef diff --git a/src/raft/election.h b/src/raft/election.h new file mode 100644 index 000000000..0ead5503a --- /dev/null +++ b/src/raft/election.h @@ -0,0 +1,81 @@ +/* Election-related logic and helpers. */ + +#ifndef ELECTION_H_ +#define ELECTION_H_ + +#include "../raft.h" + +/* Reset the election_timer clock and set randomized_election_timeout to a + * random value between election_timeout and 2 * election_timeout. + * + * From Section 3.4: + * + * Raft uses randomized election timeouts to ensure that split votes are rare + * and that they are resolved quickly. To prevent split votes in the first + * place, election timeouts are chosen randomly from a fixed interval (e.g., + * 150-300 ms). This spreads out the servers so that in most cases only a + * single server will time out. + * + * From Section 9.4: + * + * We used AvailSim to approximate a WAN spanning the continental US. Each + * message was assigned a latency chosen randomly from the uniform range of + * 30-40 ms, and the servers' election timeout range was set accordingly to + * 300-600 ms (about 10-20 times the one-way network latency). When only one + * of the five servers has failed, the average election completes within about + * 475 ms, and 99.9% of elections complete within 1.5 s. Even when two of the + * five servers have failed, the average election takes about 650 ms (about 20 + * times the one-way network latency), and 99.9% of elections complete in 3 + * s. We believe these election times are more than adequate for most WAN + * deployments. 
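+ *
+ * For example, with election_timeout = 150 ms, electionResetTimer() draws
+ * the randomized timeout uniformly from the [150 ms, 300 ms] interval.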
diff --git a/src/raft/election.h b/src/raft/election.h
new file mode 100644
index 000000000..0ead5503a
--- /dev/null
+++ b/src/raft/election.h
@@ -0,0 +1,81 @@
+/* Election-related logic and helpers. */
+
+#ifndef ELECTION_H_
+#define ELECTION_H_
+
+#include "../raft.h"
+
+/* Reset the election_timer clock and set randomized_election_timeout to a
+ * random value between election_timeout and 2 * election_timeout.
+ *
+ * From Section 3.4:
+ *
+ *   Raft uses randomized election timeouts to ensure that split votes are
+ *   rare and that they are resolved quickly. To prevent split votes in the
+ *   first place, election timeouts are chosen randomly from a fixed interval
+ *   (e.g., 150-300 ms). This spreads out the servers so that in most cases
+ *   only a single server will time out.
+ *
+ * From Section 9.4:
+ *
+ *   We used AvailSim to approximate a WAN spanning the continental US. Each
+ *   message was assigned a latency chosen randomly from the uniform range of
+ *   30-40 ms, and the servers' election timeout range was set accordingly to
+ *   300-600 ms (about 10-20 times the one-way network latency). When only one
+ *   of the five servers has failed, the average election completes within
+ *   about 475 ms, and 99.9% of elections complete within 1.5 s. Even when two
+ *   of the five servers have failed, the average election takes about 650 ms
+ *   (about 20 times the one-way network latency), and 99.9% of elections
+ *   complete in 3 s. We believe these election times are more than adequate
+ *   for most WAN deployments.
+ *
+ * Must be called in follower or candidate state. */
+void electionResetTimer(struct raft *r);
+
+/* Return true if the election timer has expired.
+ *
+ * Must be called in follower or candidate state. */
+bool electionTimerExpired(struct raft *r);
+
+/* Start a new election round.
+ *
+ * From Figure 3.1:
+ *
+ *   [Rules for Servers] Candidates: On conversion to candidates, start
+ *   election:
+ *
+ *   - Increment current term
+ *   - Vote for self
+ *   - Reset election timer
+ *   - Send RequestVote RPCs to all other servers
+ *
+ * From Section 3.4:
+ *
+ *   To begin an election, a follower increments its current term and
+ *   transitions to candidate state. It then votes for itself and issues
+ *   RequestVote RPCs in parallel to each of the other servers in the
+ *   cluster. */
+int electionStart(struct raft *r);
+
+/* Decide whether our vote should be granted to the requesting server and
+ * update our state accordingly.
+ *
+ * From Figure 3.1:
+ *
+ *   RequestVote RPC: Receiver Implementation:
+ *
+ *   - If votedFor is null or candidateId, and candidate's log is at least as
+ *     up-to-date as receiver's log, grant vote.
+ *
+ * The outcome of the decision is stored through the @granted pointer. */
+int electionVote(struct raft *r,
+		 const struct raft_request_vote *args,
+		 bool *granted);
+
+/* Update the votes array by adding the vote from the server at the given
+ * index. Return true if with this vote the server has reached a majority of
+ * votes and has won the election. */
+bool electionTally(struct raft *r, size_t voter_index);
+
+#endif /* ELECTION_H_ */
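The contract documented for electionResetTimer above (a value drawn uniformly
from [election_timeout, 2 * election_timeout]) can be exercised in isolation.
A minimal sketch, assuming a plain rand()-based source in place of the
raft_io random method:

    #include <stdlib.h>

    /* Pick a timeout uniformly in [base, 2 * base). */
    static unsigned randomized_timeout(unsigned base)
    {
            return base + (unsigned)rand() % base;
    }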
diff --git a/src/raft/entry.c b/src/raft/entry.c
new file mode 100644
index 000000000..15ac56725
--- /dev/null
+++ b/src/raft/entry.c
@@ -0,0 +1,84 @@
+#include <stdint.h>
+#include <string.h>
+
+#include "assert.h"
+#include "entry.h"
+
+void entryBatchesDestroy(struct raft_entry *entries, const size_t n)
+{
+	void *batch = NULL;
+	size_t i;
+	if (entries == NULL) {
+		assert(n == 0);
+		return;
+	}
+	assert(n > 0);
+	for (i = 0; i < n; i++) {
+		assert(entries[i].batch != NULL);
+		if (entries[i].batch != batch) {
+			batch = entries[i].batch;
+			raft_free(batch);
+		}
+	}
+	raft_free(entries);
+}
+
+int entryCopy(const struct raft_entry *src, struct raft_entry *dst)
+{
+	dst->term = src->term;
+	dst->type = src->type;
+	dst->buf.len = src->buf.len;
+	dst->buf.base = raft_malloc(dst->buf.len);
+	if (dst->buf.len > 0 && dst->buf.base == NULL) {
+		return RAFT_NOMEM;
+	}
+	memcpy(dst->buf.base, src->buf.base, dst->buf.len);
+	dst->batch = NULL;
+	return 0;
+}
+
+int entryBatchCopy(const struct raft_entry *src,
+		   struct raft_entry **dst,
+		   const size_t n)
+{
+	size_t size = 0;
+	void *batch;
+	uint8_t *cursor;
+	unsigned i;
+
+	if (n == 0) {
+		*dst = NULL;
+		return 0;
+	}
+
+	/* Calculate the total size of the entries content and allocate the
+	 * batch. */
+	for (i = 0; i < n; i++) {
+		size += src[i].buf.len;
+	}
+
+	batch = raft_malloc(size);
+	if (batch == NULL) {
+		return RAFT_NOMEM;
+	}
+
+	/* Copy the entries. */
+	*dst = raft_malloc(n * sizeof **dst);
+	if (*dst == NULL) {
+		raft_free(batch);
+		return RAFT_NOMEM;
+	}
+
+	cursor = batch;
+
+	for (i = 0; i < n; i++) {
+		(*dst)[i].term = src[i].term;
+		(*dst)[i].type = src[i].type;
+		(*dst)[i].buf.base = cursor;
+		(*dst)[i].buf.len = src[i].buf.len;
+		(*dst)[i].batch = batch;
+		memcpy((*dst)[i].buf.base, src[i].buf.base, src[i].buf.len);
+		cursor += src[i].buf.len;
+	}
+	return 0;
+}
diff --git a/src/raft/entry.h b/src/raft/entry.h
new file mode 100644
index 000000000..b571ebb8c
--- /dev/null
+++ b/src/raft/entry.h
@@ -0,0 +1,19 @@
+#ifndef ENTRY_H_
+#define ENTRY_H_
+
+#include "../raft.h"
+
+/* Release all memory associated with the given entries, including the array
+ * itself. The entries are supposed to belong to one or more batches. */
+void entryBatchesDestroy(struct raft_entry *entries, size_t n);
+
+/* Create a copy of a log entry, including its data. */
+int entryCopy(const struct raft_entry *src, struct raft_entry *dst);
+
+/* Create a single batch of entries containing a copy of the given entries,
+ * including their data. */
+int entryBatchCopy(const struct raft_entry *src,
+		   struct raft_entry **dst,
+		   size_t n);
+
+#endif /* ENTRY_H_ */
diff --git a/src/raft/err.c b/src/raft/err.c
new file mode 100644
index 000000000..cc6c5cdad
--- /dev/null
+++ b/src/raft/err.c
@@ -0,0 +1,72 @@
+#include "err.h"
+
+#include <string.h>
+
+#include "../raft.h"
+#include "assert.h"
+
+#define WRAP_SEP ": "
+#define WRAP_SEP_LEN ((size_t)strlen(WRAP_SEP))
+
+void errMsgWrap(char *e, const char *format)
+{
+	size_t n = RAFT_ERRMSG_BUF_SIZE;
+	size_t prefix_n;
+	size_t prefix_and_sep_n;
+	size_t trail_n;
+	size_t i;
+
+	/* Calculate the length of the prefix. */
+	prefix_n = strlen(format);
+
+	/* If there isn't enough space for the ": " separator and at least one
+	 * character of the wrapped error message, then just print the prefix.
+	 */
+	if (prefix_n >= n - (WRAP_SEP_LEN + 1)) {
+/* We explicitly allow truncation here + silence clang about unknown
+ * warning-group "-Wformat-truncation" */
+#ifdef __GNUC__
+#ifndef __clang__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-truncation"
+#endif
+#endif
+		ErrMsgPrintf(e, "%s", format);
+#ifdef __GNUC__
+#ifndef __clang__
+#pragma GCC diagnostic pop
+#endif
+#endif
+		return;
+	}
+
+	/* Right-shift the wrapped message, to make room for the prefix. */
+	prefix_and_sep_n = prefix_n + WRAP_SEP_LEN;
+	trail_n = strnlen(e, n - prefix_and_sep_n - 1);
+	memmove(e + prefix_and_sep_n, e, trail_n);
+	e[prefix_and_sep_n + trail_n] = 0;
+
+	/* Print the prefix. */
+	ErrMsgPrintf(e, "%s", format);
+
+	/* Print the separator.
+	 *
+	 * Avoid using strncpy(e->msg + prefix_n, WRAP_SEP, WRAP_SEP_LEN) since
+	 * it generates a warning. */
+	for (i = 0; i < WRAP_SEP_LEN; i++) {
+		e[prefix_n + i] = WRAP_SEP[i];
+	}
+}
+
+#define ERR_CODE_TO_STRING_CASE(CODE, MSG) \
+	case CODE:                         \
+		return MSG;
+
+const char *errCodeToString(int code)
+{
+	switch (code) {
+		ERR_CODE_TO_STRING_MAP(ERR_CODE_TO_STRING_CASE);
+		default:
+			return "unknown error";
+	}
+}
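errMsgWrap shifts the existing message right and prepends the new prefix plus
the ": " separator. Combined with the ErrMsgWrapf macro declared in err.h
below, a call chain composes messages outermost-first; a sketch, assuming
RAFT_ERRMSG_BUF_SIZE and the macros as defined in this patch:

    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    ErrMsgPrintf(errmsg, "open: %s", "no such file");
    ErrMsgWrapf(errmsg, "load segment %d", 1);
    /* errmsg is now "load segment 1: open: no such file" */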
diff --git a/src/raft/err.h b/src/raft/err.h
new file mode 100644
index 000000000..fb157ce90
--- /dev/null
+++ b/src/raft/err.h
@@ -0,0 +1,67 @@
+/* Utilities around error handling. */
+
+#ifndef ERROR_H_
+#define ERROR_H_
+
+#include <stdio.h>
+#include <string.h>
+
+#define ERR_CODE_TO_STRING_MAP(X)                                         \
+	X(RAFT_NOMEM, "out of memory")                                    \
+	X(RAFT_BADID, "server ID is not valid")                           \
+	X(RAFT_DUPLICATEID, "server ID already in use")                   \
+	X(RAFT_DUPLICATEADDRESS, "server address already in use")         \
+	X(RAFT_BADROLE, "server role is not valid")                       \
+	X(RAFT_MALFORMED, "encoded data is malformed")                    \
+	X(RAFT_NOTLEADER, "server is not the leader")                     \
+	X(RAFT_LEADERSHIPLOST, "server has lost leadership")              \
+	X(RAFT_SHUTDOWN, "server is shutting down")                       \
+	X(RAFT_CANTBOOTSTRAP, "bootstrap only works on new clusters")     \
+	X(RAFT_CANTCHANGE, "a configuration change is already in progress") \
+	X(RAFT_CORRUPT, "persisted data is corrupted")                    \
+	X(RAFT_CANCELED, "operation canceled")                            \
+	X(RAFT_NAMETOOLONG, "resource name too long")                     \
+	X(RAFT_TOOBIG, "data is too big")                                 \
+	X(RAFT_NOCONNECTION, "no connection to remote server available")  \
+	X(RAFT_BUSY, "operation can't be performed at this time")         \
+	X(RAFT_IOERR, "I/O error")                                        \
+	X(RAFT_NOTFOUND, "Resource not found")                            \
+	X(RAFT_INVALID, "Invalid parameter")                              \
+	X(RAFT_UNAUTHORIZED, "No access to resource")                     \
+	X(RAFT_NOSPACE, "Not enough disk space")                          \
+	X(RAFT_TOOMANY, "System or raft limit met or exceeded")
+
+/* Format an error message. */
+#define ErrMsgPrintf(ERRMSG, ...) \
+	snprintf(ERRMSG, RAFT_ERRMSG_BUF_SIZE, __VA_ARGS__)
+
+/* Wrap the given error message with an additional prefix message. */
+#define ErrMsgWrapf(ERRMSG, ...)                            \
+	do {                                                \
+		char _errmsg[RAFT_ERRMSG_BUF_SIZE];         \
+		ErrMsgPrintf(_errmsg, __VA_ARGS__);         \
+		errMsgWrap(ERRMSG, _errmsg);                \
+	} while (0)
+
+void errMsgWrap(char *e, const char *format);
+
+/* Transfer an error message from one object to another, wrapping it. */
+#define ErrMsgTransfer(ERRMSG1, ERRMSG2, FORMAT)         \
+	memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE);  \
+	ErrMsgWrapf(ERRMSG2, FORMAT)
+
+#define ErrMsgTransferf(ERRMSG1, ERRMSG2, FORMAT, ...)   \
+	memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE);  \
+	ErrMsgWrapf(ERRMSG2, FORMAT, __VA_ARGS__)
+
+/* Use the static error message for the error with the given code. */
+#define ErrMsgFromCode(ERRMSG, CODE) \
+	ErrMsgPrintf(ERRMSG, "%s", errCodeToString(CODE))
+
+/* Format the out of memory error message. */
+#define ErrMsgOom(ERRMSG) ErrMsgFromCode(ERRMSG, RAFT_NOMEM)
+
+/* Convert a numeric raft error code to a human-readable error message. */
+const char *errCodeToString(int code);
+
+#endif /* ERROR_H_ */
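ERR_CODE_TO_STRING_MAP above is an X-macro: one list of (code, message) pairs
expands differently at each use site, so codes and strings can never drift
apart. A minimal standalone sketch of the technique, with hypothetical names:

    #define COLOR_MAP(X) \
            X(RED, "red") \
            X(GREEN, "green")

    enum color {
    #define ENUM_ITEM(CODE, MSG) CODE,
            COLOR_MAP(ENUM_ITEM)
    #undef ENUM_ITEM
    };

    static const char *colorName(enum color c)
    {
            switch (c) {
    #define STRING_CASE(CODE, MSG) \
            case CODE:             \
                    return MSG;
                    COLOR_MAP(STRING_CASE)
    #undef STRING_CASE
            }
            return "unknown";
    }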
diff --git a/src/raft/fixture.c b/src/raft/fixture.c
new file mode 100644
index 000000000..cef8cfc41
--- /dev/null
+++ b/src/raft/fixture.c
@@ -0,0 +1,1995 @@
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../raft.h"
+#include "../tracing.h"
+#include "assert.h"
+#include "configuration.h"
+#include "convert.h"
+#include "entry.h"
+#include "log.h"
+#include "queue.h"
+#include "snapshot.h"
+
+/* Defaults */
+#define HEARTBEAT_TIMEOUT 100
+#define INSTALL_SNAPSHOT_TIMEOUT 30000
+#define ELECTION_TIMEOUT 1000
+#define NETWORK_LATENCY 15
+#define DISK_LATENCY 10
+#define WORK_DURATION 200
+#define SEND_LATENCY 0
+
+/* To keep in sync with raft.h */
+#define N_MESSAGE_TYPES 6
+
+/* Maximum number of peer stub instances connected to a certain stub
+ * instance. This should be enough for testing purposes. */
+#define MAX_PEERS 8
+
+struct raft_fixture_server
+{
+	bool alive;                /* If false, the server is down. */
+	raft_id id;                /* Server ID. */
+	char address[16];          /* Server address (stringified ID). */
+	struct raft_tracer tracer; /* Tracer. */
+	struct raft_io io;         /* In-memory raft_io implementation. */
+	struct raft raft;          /* Raft instance. */
+};
+
+struct raft_fixture_event
+{
+	unsigned server_index; /* Index of the server the event occurred on. */
+	int type;              /* Type of the event. */
+};
+
+RAFT_API int raft_fixture_event_type(struct raft_fixture_event *event)
+{
+	assert(event != NULL);
+	return event->type;
+}
+
+RAFT_API unsigned raft_fixture_event_server_index(
+    struct raft_fixture_event *event)
+{
+	assert(event != NULL);
+	return event->server_index;
+}
+
+/* Fields common across all request types. */
+#define REQUEST                                                               \
+	int type;                  /* Request code type. */                   \
+	raft_time completion_time; /* When the request should be fulfilled. */ \
+	queue queue                /* Link the I/O pending requests queue. */
+
+/* Request type codes. */
+enum { APPEND = 1, SEND, TRANSMIT, SNAPSHOT_PUT, SNAPSHOT_GET, ASYNC_WORK };
+
+/* Abstract base type for an asynchronous request submitted to the stub I/O
+ * implementation. */
+struct ioRequest
+{
+	REQUEST;
+};
+
+/* Pending request to append entries to the log. */
+struct append
+{
+	REQUEST;
+	struct raft_io_append *req;
+	const struct raft_entry *entries;
+	unsigned n;
+	unsigned start; /* Request timestamp. */
+};
+
+/* Pending request to send a message. */
+struct send
+{
+	REQUEST;
+	struct raft_io_send *req;
+	struct raft_message message;
+};
+
+/* Pending request to store a snapshot. */
+struct snapshot_put
+{
+	REQUEST;
+	unsigned trailing;
+	struct raft_io_snapshot_put *req;
+	const struct raft_snapshot *snapshot;
+};
+
+/* Pending request to perform general work. */
+struct async_work
+{
+	REQUEST;
+	struct raft_io_async_work *req;
+};
+
+/* Pending request to load a snapshot. */
+struct snapshot_get
+{
+	REQUEST;
+	struct raft_io_snapshot_get *req;
+};
+
+/* Message that has been written to the network and is waiting to be delivered
+ * (or discarded). */
+struct transmit
+{
+	REQUEST;
+	struct raft_message message; /* Message to deliver */
+	int timer;                   /* Deliver after this n of msecs. */
+};
+
+/* Information about a peer server. */
+struct peer
+{
+	struct io *io;  /* The peer's I/O backend. */
+	bool connected; /* Whether a connection is established. */
+	bool saturated; /* Whether the established connection is saturated. */
+	unsigned send_latency;
+};
+
+/* Stub I/O implementation implementing all operations in-memory. */
+struct io
+{
+	struct raft_io *io;  /* I/O object we're implementing. */
+	unsigned index;      /* Fixture server index. */
+	raft_time *time;     /* Global cluster time. */
+	raft_time next_tick; /* Time the next tick should occur. */
+
+	/* Term and vote */
+	raft_term term;
+	raft_id voted_for;
+
+	/* Log */
+	struct raft_snapshot *snapshot; /* Latest snapshot */
+	struct raft_entry *entries;     /* Array of persisted entries */
+	size_t n;                       /* Size of the persisted entries array */
+
+	/* Parameters passed via raft_io->init and raft_io->start */
+	raft_id id;
+	const char *address;
+	unsigned tick_interval;
+	raft_io_tick_cb tick_cb;
+	raft_io_recv_cb recv_cb;
+
+	/* Queue of pending asynchronous requests, whose callbacks still haven't
+	 * been fired. */
+	queue requests;
+
+	/* Peers connected to us. */
+	struct peer peers[MAX_PEERS];
+	unsigned n_peers;
+
+	unsigned randomized_election_timeout; /* Value returned by io->random() */
+	unsigned network_latency;             /* Milliseconds to deliver RPCs */
+	unsigned disk_latency;                /* Milliseconds to perform disk I/O */
+	unsigned work_duration;               /* Milliseconds to run async work */
+
+	int append_fault_countdown;
+	int vote_fault_countdown;
+	int term_fault_countdown;
+	int send_fault_countdown;
+
+	/* If flag i is true, messages of type i will be silently dropped. */
+	bool drop[N_MESSAGE_TYPES];
+
+	/* Counters of events that happened so far. */
+	unsigned n_send[N_MESSAGE_TYPES];
+	unsigned n_recv[N_MESSAGE_TYPES];
+	unsigned n_append;
+};
+
+static bool faultTick(int *countdown)
+{
+	bool trigger = *countdown == 0;
+	if (*countdown >= 0) {
+		*countdown -= 1;
+	}
+	return trigger;
+}
+
+static int ioMethodInit(struct raft_io *raft_io,
+			raft_id id,
+			const char *address)
+{
+	struct io *io = raft_io->impl;
+	io->id = id;
+	io->address = address;
+	return 0;
+}
+
+static int ioMethodStart(struct raft_io *raft_io,
+			 unsigned msecs,
+			 raft_io_tick_cb tick_cb,
+			 raft_io_recv_cb recv_cb)
+{
+	struct io *io = raft_io->impl;
+	io->tick_interval = msecs;
+	io->tick_cb = tick_cb;
+	io->recv_cb = recv_cb;
+	io->next_tick = *io->time + io->tick_interval;
+	return 0;
+}
+
+/* Flush an append entries request, appending its entries to the local
+ * in-memory log. */
+static void ioFlushAppend(struct io *s, struct append *append)
+{
+	struct raft_entry *entries;
+	unsigned i;
+	int status = 0;
+
+	/* Simulates a disk write failure. */
+	if (faultTick(&s->append_fault_countdown)) {
+		status = RAFT_IOERR;
+		goto done;
+	}
+
+	/* Allocate an array for the old entries plus the new ones. */
+	entries =
+	    raft_realloc(s->entries, (s->n + append->n) * sizeof *s->entries);
+	assert(entries != NULL);
+
+	/* Copy new entries into the new array. */
+	for (i = 0; i < append->n; i++) {
+		const struct raft_entry *src = &append->entries[i];
+		struct raft_entry *dst = &entries[s->n + i];
+		int rv = entryCopy(src, dst);
+		assert(rv == 0);
+	}
+
+	s->entries = entries;
+	s->n += append->n;
+
+done:
+	if (append->req->cb != NULL) {
+		append->req->cb(append->req, status);
+	}
+	raft_free(append);
+}
+
+/* Flush a snapshot put request, copying the snapshot data. */
+static void ioFlushSnapshotPut(struct io *s, struct snapshot_put *r)
+{
+	int rv;
+
+	if (s->snapshot == NULL) {
+		s->snapshot = raft_malloc(sizeof *s->snapshot);
+		assert(s->snapshot != NULL);
+	} else {
+		snapshotClose(s->snapshot);
+	}
+
+	rv = snapshotCopy(r->snapshot, s->snapshot);
+	assert(rv == 0);
+
+	if (r->trailing == 0) {
+		rv = s->io->truncate(s->io, 1);
+		assert(rv == 0);
+	}
+
+	if (r->req->cb != NULL) {
+		r->req->cb(r->req, 0);
+	}
+	raft_free(r);
+}
+
+/* Flush a snapshot get request, returning to the client a copy of the local
+ * snapshot (if any). */
+static void ioFlushSnapshotGet(struct io *s, struct snapshot_get *r)
+{
+	struct raft_snapshot *snapshot;
+	int rv;
+	snapshot = raft_malloc(sizeof *snapshot);
+	assert(snapshot != NULL);
+	rv = snapshotCopy(s->snapshot, snapshot);
+	assert(rv == 0);
+	r->req->cb(r->req, snapshot, 0);
+	raft_free(r);
+}
+
+/* Flush an async work request */
+static void ioFlushAsyncWork(struct io *s, struct async_work *r)
+{
+	(void)s;
+	int rv;
+	rv = r->req->work(r->req);
+	r->req->cb(r->req, rv);
+	raft_free(r);
+}
+
+/* Search for the peer with the given ID. */
+static struct peer *ioGetPeer(struct io *io, raft_id id)
+{
+	unsigned i;
+	for (i = 0; i < io->n_peers; i++) {
+		struct peer *peer = &io->peers[i];
+		if (peer->io->id == id) {
+			return peer;
+		}
+	}
+	return NULL;
+}
+
+/* Copy the dynamically allocated memory of an AppendEntries message. */
+static void copyAppendEntries(const struct raft_append_entries *src,
+			      struct raft_append_entries *dst)
+{
+	int rv;
+	rv = entryBatchCopy(src->entries, &dst->entries, src->n_entries);
+	assert(rv == 0);
+	dst->n_entries = src->n_entries;
+}
+
+/* Copy the dynamically allocated memory of an InstallSnapshot message. */
+static void copyInstallSnapshot(const struct raft_install_snapshot *src,
+				struct raft_install_snapshot *dst)
+{
+	int rv;
+	rv = configurationCopy(&src->conf, &dst->conf);
+	assert(rv == 0);
+	dst->data.base = raft_malloc(dst->data.len);
+	assert(dst->data.base != NULL);
+	memcpy(dst->data.base, src->data.base, src->data.len);
+}
+
+/* Flush a raft_io_send request, copying the message content into a new struct
+ * transmit object and invoking the user callback. */
+static void ioFlushSend(struct io *io, struct send *send)
+{
+	struct peer *peer;
+	struct transmit *transmit;
+	struct raft_message *src;
+	struct raft_message *dst;
+	int status;
+
+	/* If the peer doesn't exist or was disconnected, fail the request. */
+	peer = ioGetPeer(io, send->message.server_id);
+	if (peer == NULL || !peer->connected) {
+		status = RAFT_NOCONNECTION;
+		goto out;
+	}
+
+	transmit = raft_calloc(1, sizeof *transmit);
+	assert(transmit != NULL);
+
+	transmit->type = TRANSMIT;
+	transmit->completion_time = *io->time + io->network_latency;
+
+	src = &send->message;
+	dst = &transmit->message;
+
+	QUEUE_PUSH(&io->requests, &transmit->queue);
+
+	*dst = *src;
+	switch (dst->type) {
+		case RAFT_IO_APPEND_ENTRIES:
+			/* Make a copy of the entries being sent */
+			copyAppendEntries(&src->append_entries,
+					  &dst->append_entries);
+			break;
+		case RAFT_IO_INSTALL_SNAPSHOT:
+			copyInstallSnapshot(&src->install_snapshot,
+					    &dst->install_snapshot);
+			break;
+	}
+
+	io->n_send[send->message.type]++;
+	status = 0;
+
+out:
+	if (send->req->cb != NULL) {
+		send->req->cb(send->req, status);
+	}
+
+	raft_free(send);
+}
+
+/* Release the memory used by the given message transmit object. */
+static void ioDestroyTransmit(struct transmit *transmit)
+{
+	struct raft_message *message;
+	message = &transmit->message;
+	switch (message->type) {
+		case RAFT_IO_APPEND_ENTRIES:
+			if (message->append_entries.entries != NULL) {
+				raft_free(
+				    message->append_entries.entries[0].batch);
+				raft_free(message->append_entries.entries);
+			}
+			break;
+		case RAFT_IO_INSTALL_SNAPSHOT:
+			raft_configuration_close(
+			    &message->install_snapshot.conf);
+			raft_free(message->install_snapshot.data.base);
+			break;
+	}
+	raft_free(transmit);
+}
+
+/* Flush all requests in the queue. */
+static void ioFlushAll(struct io *io)
+{
+	while (!QUEUE_IS_EMPTY(&io->requests)) {
+		queue *head;
+		struct ioRequest *r;
+
+		head = QUEUE_HEAD(&io->requests);
+		QUEUE_REMOVE(head);
+
+		r = QUEUE_DATA(head, struct ioRequest, queue);
+		switch (r->type) {
+			case APPEND:
+				ioFlushAppend(io, (struct append *)r);
+				break;
+			case SEND:
+				ioFlushSend(io, (struct send *)r);
+				break;
+			case TRANSMIT:
+				ioDestroyTransmit((struct transmit *)r);
+				break;
+			case SNAPSHOT_PUT:
+				ioFlushSnapshotPut(io,
+						   (struct snapshot_put *)r);
+				break;
+			case SNAPSHOT_GET:
+				ioFlushSnapshotGet(io,
+						   (struct snapshot_get *)r);
+				break;
+			case ASYNC_WORK:
+				ioFlushAsyncWork(io, (struct async_work *)r);
+				break;
+			default:
+				assert(0);
+		}
+	}
+}
+
+static void ioMethodClose(struct raft_io *raft_io, raft_io_close_cb cb)
+{
+	if (cb != NULL) {
+		cb(raft_io);
+	}
+}
+
+static int ioMethodLoad(struct raft_io *io,
+			raft_term *term,
+			raft_id *voted_for,
+			struct raft_snapshot **snapshot,
+			raft_index *start_index,
+			struct raft_entry **entries,
+			size_t *n_entries)
+{
+	struct io *s;
+	int rv;
+
+	s = io->impl;
+
+	*term = s->term;
+	*voted_for = s->voted_for;
+	*start_index = 1;
+
+	*n_entries = s->n;
+
+	/* Make a copy of the persisted entries, storing their data into a
+	 * single batch. */
+	rv = entryBatchCopy(s->entries, entries, s->n);
+	assert(rv == 0);
+
+	if (s->snapshot != NULL) {
+		*snapshot = raft_malloc(sizeof **snapshot);
+		assert(*snapshot != NULL);
+		rv = snapshotCopy(s->snapshot, *snapshot);
+		assert(rv == 0);
+		*start_index = (*snapshot)->index + 1;
+	} else {
+		*snapshot = NULL;
+	}
+
+	return 0;
+}
+
+static int ioMethodBootstrap(struct raft_io *raft_io,
+			     const struct raft_configuration *conf)
+{
+	struct io *io = raft_io->impl;
+	struct raft_buffer buf;
+	struct raft_entry *entries;
+	int rv;
+
+	if (io->term != 0) {
+		return RAFT_CANTBOOTSTRAP;
+	}
+
+	assert(io->voted_for == 0);
+	assert(io->snapshot == NULL);
+	assert(io->entries == NULL);
+	assert(io->n == 0);
+
+	/* Encode the given configuration. */
+	rv = configurationEncode(conf, &buf);
+	if (rv != 0) {
+		return rv;
+	}
+
+	entries = raft_calloc(1, sizeof *io->entries);
+	if (entries == NULL) {
+		return RAFT_NOMEM;
+	}
+
+	entries[0].term = 1;
+	entries[0].type = RAFT_CHANGE;
+	entries[0].buf = buf;
+
+	io->term = 1;
+	io->voted_for = 0;
+	io->snapshot = NULL;
+	io->entries = entries;
+	io->n = 1;
+
+	return 0;
+}
+
+static int ioMethodRecover(struct raft_io *io,
+			   const struct raft_configuration *conf)
+{
+	/* TODO: implement this API */
+	(void)io;
+	(void)conf;
+	return RAFT_IOERR;
+}
+
+static int ioMethodSetTerm(struct raft_io *raft_io, const raft_term term)
+{
+	struct io *io = raft_io->impl;
+
+	if (faultTick(&io->term_fault_countdown)) {
+		return RAFT_IOERR;
+	}
+
+	io->term = term;
+	io->voted_for = 0;
+
+	return 0;
+}
+
+static int ioMethodSetVote(struct raft_io *raft_io, const raft_id server_id)
+{
+	struct io *io = raft_io->impl;
+
+	if (faultTick(&io->vote_fault_countdown)) {
+		return RAFT_IOERR;
+	}
+
+	io->voted_for = server_id;
+
+	return 0;
+}
+
+static int ioMethodAppend(struct raft_io *raft_io,
+			  struct raft_io_append *req,
+			  const struct raft_entry entries[],
+			  unsigned n,
+			  raft_io_append_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct append *r;
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = APPEND;
+	r->completion_time = *io->time + io->disk_latency;
+	r->req = req;
+	r->entries = entries;
+	r->n = n;
+
+	req->cb = cb;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+
+	return 0;
+}
+
+static int ioMethodTruncate(struct raft_io *raft_io, raft_index index)
+{
+	struct io *io = raft_io->impl;
+	size_t n;
+
+	n = (size_t)(index - 1); /* Number of entries left after truncation */
+
+	if (n > 0) {
+		struct raft_entry *entries;
+
+		/* Create a new array of entries holding the non-truncated
+		 * entries */
+		entries = raft_malloc(n * sizeof *entries);
+		if (entries == NULL) {
+			return RAFT_NOMEM;
+		}
+		memcpy(entries, io->entries, n * sizeof *io->entries);
+
+		/* Release any truncated entry */
+		if (io->entries != NULL) {
+			size_t i;
+			for (i = n; i < io->n; i++) {
+				raft_free(io->entries[i].buf.base);
+			}
+			raft_free(io->entries);
+		}
+		io->entries = entries;
+	} else {
+		/* Release everything we have */
+		if (io->entries != NULL) {
+			size_t i;
+			for (i = 0; i < io->n; i++) {
+				raft_free(io->entries[i].buf.base);
+			}
+			raft_free(io->entries);
+			io->entries = NULL;
+		}
+	}
+
+	io->n = n;
+
+	return 0;
+}
+
+static int ioMethodSnapshotPut(struct raft_io *raft_io,
+			       unsigned trailing,
+			       struct raft_io_snapshot_put *req,
+			       const struct raft_snapshot *snapshot,
+			       raft_io_snapshot_put_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct snapshot_put *r;
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = SNAPSHOT_PUT;
+	r->req = req;
+	r->req->cb = cb;
+	r->snapshot = snapshot;
+	r->completion_time = *io->time + io->disk_latency;
+	r->trailing = trailing;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+
+	return 0;
+}
+
+static int ioMethodAsyncWork(struct raft_io *raft_io,
+			     struct raft_io_async_work *req,
+			     raft_io_async_work_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct async_work *r;
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = ASYNC_WORK;
+	r->req = req;
+	r->req->cb = cb;
+	r->completion_time = *io->time + io->work_duration;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+	return 0;
+}
+
+static int ioMethodSnapshotGet(struct raft_io *raft_io,
+			       struct raft_io_snapshot_get *req,
+			       raft_io_snapshot_get_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct snapshot_get *r;
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = SNAPSHOT_GET;
+	r->req = req;
+	r->req->cb = cb;
+	r->completion_time = *io->time + io->disk_latency;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+
+	return 0;
+}
+
+static raft_time ioMethodTime(struct raft_io *raft_io)
+{
+	struct io *io = raft_io->impl;
+	return *io->time;
+}
+
+static int ioMethodRandom(struct raft_io *raft_io, int min, int max)
+{
+	struct io *io = raft_io->impl;
+	int t = (int)io->randomized_election_timeout;
+	if (t < min) {
+		return min;
+	} else if (t > max) {
+		return max;
+	} else {
+		return t;
+	}
+}
+
+/* Queue up a request which will be processed later, when its completion time
+ * is reached or the whole queue is flushed. */
+static int ioMethodSend(struct raft_io *raft_io,
+			struct raft_io_send *req,
+			const struct raft_message *message,
+			raft_io_send_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct send *r;
+	struct peer *peer;
+
+	if (faultTick(&io->send_fault_countdown)) {
+		return RAFT_IOERR;
+	}
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = SEND;
+	r->req = req;
+	r->message = *message;
+	r->req->cb = cb;
+
+	peer = ioGetPeer(io, message->server_id);
+	r->completion_time = *io->time + peer->send_latency;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+
+	return 0;
+}
+
+static void ioReceive(struct io *io, struct raft_message *message)
+{
+	io->recv_cb(io->io, message);
+	io->n_recv[message->type]++;
+}
+
+static void ioDeliverTransmit(struct io *io, struct transmit *transmit)
+{
+	struct raft_message *message = &transmit->message;
+	struct peer *peer; /* Destination peer */
+
+	/* If this message type is in the drop list, let's discard it */
+	if (io->drop[message->type - 1]) {
+		ioDestroyTransmit(transmit);
+		return;
+	}
+
+	peer = ioGetPeer(io, message->server_id);
+
+	/* If we don't have a peer with this ID, or the peer is disconnected,
+	 * or the connection is saturated, drop the message. */
+	if (peer == NULL || !peer->connected || peer->saturated) {
+		ioDestroyTransmit(transmit);
+		return;
+	}
+
+	/* Update the message object with our details. */
+	message->server_id = io->id;
+	message->server_address = io->address;
+
+	ioReceive(peer->io, message);
+	raft_free(transmit);
+}
+
+/* Connect @raft_io to @other, enabling delivery of messages sent from
+ * @raft_io to @other. */
+static void ioConnect(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	assert(io->n_peers < MAX_PEERS);
+	io->peers[io->n_peers].io = io_other;
+	io->peers[io->n_peers].connected = true;
+	io->peers[io->n_peers].saturated = false;
+	io->peers[io->n_peers].send_latency = SEND_LATENCY;
+	io->n_peers++;
+}
+
+/* Return whether the connection with the given peer is saturated. */
+static bool ioSaturated(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	struct peer *peer;
+	peer = ioGetPeer(io, io_other->id);
+	return peer != NULL && peer->saturated;
+}
+
+/* Disconnect @raft_io and @other, causing calls to @raft_io->send() to fail
+ * asynchronously when sending messages to @other. */
+static void ioDisconnect(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	struct peer *peer;
+	peer = ioGetPeer(io, io_other->id);
+	assert(peer != NULL);
+	peer->connected = false;
+}
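Note the two distinct failure modes modeled here: a disconnected link makes
io->send fail with RAFT_NOCONNECTION, while a saturated link accepts sends but
drops the messages in flight. With the public wrappers added further down in
this file, a test can script a partition like this (a sketch; `f` is a
hypothetical initialized fixture):

    raft_fixture_disconnect(&f, 0, 1); /* sends from 0 to 1 fail with RAFT_NOCONNECTION */
    raft_fixture_reconnect(&f, 0, 1);  /* sends succeed and are delivered again */
    raft_fixture_saturate(&f, 0, 1);   /* sends succeed but messages are dropped */
    raft_fixture_desaturate(&f, 0, 1);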
+
+/* Reconnect @raft_io and @other. */
+static void ioReconnect(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	struct peer *peer;
+	peer = ioGetPeer(io, io_other->id);
+	assert(peer != NULL);
+	peer->connected = true;
+}
+
+/* Saturate the connection from @io to @other, causing messages sent from @io
+ * to @other to be dropped. */
+static void ioSaturate(struct raft_io *io, struct raft_io *other)
+{
+	struct io *s;
+	struct io *s_other;
+	struct peer *peer;
+	s = io->impl;
+	s_other = other->impl;
+	peer = ioGetPeer(s, s_other->id);
+	assert(peer != NULL && peer->connected);
+	peer->saturated = true;
+}
+
+/* Desaturate the connection from @raft_io to @other, re-enabling delivery of
+ * messages sent from @raft_io to @other. */
+static void ioDesaturate(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	struct peer *peer;
+	peer = ioGetPeer(io, io_other->id);
+	assert(peer != NULL && peer->connected);
+	peer->saturated = false;
+}
+
+/* Enable or disable silently dropping all outgoing messages of type @type. */
+void ioDrop(struct io *io, int type, bool flag)
+{
+	io->drop[type - 1] = flag;
+}
+
+static int ioInit(struct raft_io *raft_io, unsigned index, raft_time *time)
+{
+	struct io *io;
+	io = raft_malloc(sizeof *io);
+	assert(io != NULL);
+	io->io = raft_io;
+	io->index = index;
+	io->time = time;
+	io->term = 0;
+	io->voted_for = 0;
+	io->snapshot = NULL;
+	io->entries = NULL;
+	io->n = 0;
+	QUEUE_INIT(&io->requests);
+	io->n_peers = 0;
+	io->randomized_election_timeout = ELECTION_TIMEOUT + index * 100;
+	io->network_latency = NETWORK_LATENCY;
+	io->disk_latency = DISK_LATENCY;
+	io->work_duration = WORK_DURATION;
+	io->append_fault_countdown = -1;
+	io->vote_fault_countdown = -1;
+	io->term_fault_countdown = -1;
+	io->send_fault_countdown = -1;
+	memset(io->drop, 0, sizeof io->drop);
+	memset(io->n_send, 0, sizeof io->n_send);
+	memset(io->n_recv, 0, sizeof io->n_recv);
+	io->n_append = 0;
+
+	raft_io->impl = io;
+	raft_io->version = 2;
+	raft_io->init = ioMethodInit;
+	raft_io->close = ioMethodClose;
+	raft_io->start = ioMethodStart;
+	raft_io->load = ioMethodLoad;
+	raft_io->bootstrap = ioMethodBootstrap;
+	raft_io->recover = ioMethodRecover;
+	raft_io->set_term = ioMethodSetTerm;
+	raft_io->set_vote = ioMethodSetVote;
+	raft_io->append = ioMethodAppend;
+	raft_io->truncate = ioMethodTruncate;
+	raft_io->send = ioMethodSend;
+	raft_io->snapshot_put = ioMethodSnapshotPut;
+	raft_io->async_work = ioMethodAsyncWork;
+	raft_io->snapshot_get = ioMethodSnapshotGet;
+	raft_io->time = ioMethodTime;
+	raft_io->random = ioMethodRandom;
+
+	return 0;
+}
+
+/* Release all memory held by the given stub I/O implementation. */
+void ioClose(struct raft_io *raft_io)
+{
+	struct io *io = raft_io->impl;
+	size_t i;
+	for (i = 0; i < io->n; i++) {
+		struct raft_entry *entry = &io->entries[i];
+		raft_free(entry->buf.base);
+	}
+	if (io->entries != NULL) {
+		raft_free(io->entries);
+	}
+	if (io->snapshot != NULL) {
+		snapshotClose(io->snapshot);
+		raft_free(io->snapshot);
+	}
+	raft_free(io);
+}
+
+/* Custom emit tracer function which includes the server ID. */
+static void emit(struct raft_tracer *t,
+		 const char *file,
+		 unsigned int line,
+		 const char *func,
+		 unsigned int level,
+		 const char *message)
+{
+	unsigned id = *(unsigned *)t->impl;
+	(void)func;
+	(void)level;
+	fprintf(stderr, "%d: %30s:%*d - %s\n", id, file, 3, line, message);
+}
+
+static int serverInit(struct raft_fixture *f, unsigned i, struct raft_fsm *fsm)
+{
+	int rv;
+	struct raft_fixture_server *s;
+	s = raft_malloc(sizeof(*s));
+	if (s == NULL) {
+		return RAFT_NOMEM;
+	}
+	f->servers[i] = s;
+	s->alive = true;
+	s->id = i + 1;
+	sprintf(s->address, "%llu", s->id);
+	rv = ioInit(&s->io, i, &f->time);
+	if (rv != 0) {
+		return rv;
+	}
+	rv = raft_init(&s->raft, &s->io, fsm, s->id, s->address);
+	if (rv != 0) {
+		return rv;
+	}
+	raft_set_election_timeout(&s->raft, ELECTION_TIMEOUT);
+	raft_set_heartbeat_timeout(&s->raft, HEARTBEAT_TIMEOUT);
+	raft_set_install_snapshot_timeout(&s->raft, INSTALL_SNAPSHOT_TIMEOUT);
+	s->tracer.impl = (void *)&s->id;
+	s->tracer.emit = emit;
+	s->raft.tracer = NULL;
+	return 0;
+}
+
+static void serverClose(struct raft_fixture_server *s)
+{
+	raft_close(&s->raft, NULL);
+	ioClose(&s->io);
+	raft_free(s);
+}
+
+/* Connect the server with the given index to all others */
+static void serverConnectToAll(struct raft_fixture *f, unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		struct raft_io *io1 = &f->servers[i]->io;
+		struct raft_io *io2 = &f->servers[j]->io;
+		if (i == j) {
+			continue;
+		}
+		ioConnect(io1, io2);
+	}
+}
+
+int raft_fixture_init(struct raft_fixture *f)
+{
+	f->time = 0;
+	f->n = 0;
+	f->log = logInit();
+	if (f->log == NULL) {
+		return RAFT_NOMEM;
+	}
+	f->commit_index = 0;
+	f->hook = NULL;
+	f->event = raft_malloc(sizeof(*f->event));
+	if (f->event == NULL) {
+		return RAFT_NOMEM;
+	}
+	return 0;
+}
+
+void raft_fixture_close(struct raft_fixture *f)
+{
+	unsigned i;
+	for (i = 0; i < f->n; i++) {
+		struct io *io = f->servers[i]->io.impl;
+		ioFlushAll(io);
+	}
+	for (i = 0; i < f->n; i++) {
+		serverClose(f->servers[i]);
+	}
+	raft_free(f->event);
+	logClose(f->log);
+}
+
+int raft_fixture_configuration(struct raft_fixture *f,
+			       unsigned n_voting,
+			       struct raft_configuration *configuration)
+{
+	unsigned i;
+	assert(f->n > 0);
+	assert(n_voting > 0);
+	assert(n_voting <= f->n);
+	raft_configuration_init(configuration);
+	for (i = 0; i < f->n; i++) {
+		struct raft_fixture_server *s;
+		int role = i < n_voting ? RAFT_VOTER : RAFT_STANDBY;
+		int rv;
+		s = f->servers[i];
+		rv = raft_configuration_add(configuration, s->id, s->address,
+					    role);
+		if (rv != 0) {
+			return rv;
+		}
+	}
+	return 0;
+}
+
+int raft_fixture_bootstrap(struct raft_fixture *f,
+			   struct raft_configuration *configuration)
+{
+	unsigned i;
+	for (i = 0; i < f->n; i++) {
+		struct raft *raft = raft_fixture_get(f, i);
+		int rv;
+		rv = raft_bootstrap(raft, configuration);
+		if (rv != 0) {
+			return rv;
+		}
+	}
+	return 0;
+}
+
+int raft_fixture_start(struct raft_fixture *f)
+{
+	unsigned i;
+	int rv;
+	for (i = 0; i < f->n; i++) {
+		struct raft_fixture_server *s = f->servers[i];
+		rv = raft_start(&s->raft);
+		if (rv != 0) {
+			return rv;
+		}
+	}
+	return 0;
+}
+
+unsigned raft_fixture_n(struct raft_fixture *f)
+{
+	return f->n;
+}
+
+raft_time raft_fixture_time(struct raft_fixture *f)
+{
+	return f->time;
+}
+
+struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i)
+{
+	assert(i < f->n);
+	return &f->servers[i]->raft;
+}
+
+bool raft_fixture_alive(struct raft_fixture *f, unsigned i)
+{
+	assert(i < f->n);
+	return f->servers[i]->alive;
+}
+
+unsigned raft_fixture_leader_index(struct raft_fixture *f)
+{
+	if (f->leader_id != 0) {
+		return (unsigned)(f->leader_id - 1);
+	}
+	return f->n;
+}
+
+raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i)
+{
+	struct io *io = f->servers[i]->io.impl;
+	return io->voted_for;
+}
+
+/* Update the leader and check for election safety.
+ *
+ * From Figure 3.2:
+ *
+ *   Election Safety -> At most one leader can be elected in a given term.
+ *
+ * Return true if the current leader turns out to be different from the one at
+ * the time this function was called. */
+static bool updateLeaderAndCheckElectionSafety(struct raft_fixture *f)
+{
+	raft_id leader_id = 0;
+	unsigned leader_i = 0;
+	raft_term leader_term = 0;
+	unsigned i;
+	bool changed;
+
+	for (i = 0; i < f->n; i++) {
+		struct raft *raft = raft_fixture_get(f, i);
+		unsigned j;
+
+		/* If the server is not alive or is not the leader, skip to the
+		 * next server. */
+		if (!raft_fixture_alive(f, i) ||
+		    raft_state(raft) != RAFT_LEADER) {
+			continue;
+		}
+
+		/* Check that no other server is leader for this term. */
+		for (j = 0; j < f->n; j++) {
+			struct raft *other = raft_fixture_get(f, j);
+
+			if (other->id == raft->id ||
+			    other->state != RAFT_LEADER) {
+				continue;
+			}
+
+			if (other->current_term == raft->current_term) {
+				fprintf(stderr,
+					"server %llu and %llu are both leaders "
+					"in term %llu",
+					raft->id, other->id,
+					raft->current_term);
+				abort();
+			}
+		}
+
+		if (raft->current_term > leader_term) {
+			leader_id = raft->id;
+			leader_i = i;
+			leader_term = raft->current_term;
+		}
+	}
+
+	/* Check that the leader is stable, in the sense that it has been
+	 * acknowledged by all alive servers connected to it, and those servers
+	 * together with the leader form a majority. */
+	if (leader_id != 0) {
+		unsigned n_acks = 0;
+		bool acked = true;
+		unsigned n_quorum = 0;
+
+		for (i = 0; i < f->n; i++) {
+			struct raft *raft = raft_fixture_get(f, i);
+			const struct raft_server *server =
+			    configurationGet(&raft->configuration, raft->id);
+
+			/* If the server is not in the configuration or is
+			 * idle, then don't count it. */
+			if (server == NULL || server->role == RAFT_SPARE) {
+				continue;
+			}
+
+			n_quorum++;
+
+			/* If this server is itself the leader, or it's not
+			 * alive or it's not connected to the leader, then
+			 * don't count it in for stability. */
+			if (i == leader_i || !raft_fixture_alive(f, i) ||
+			    raft_fixture_saturated(f, leader_i, i)) {
+				continue;
+			}
+
+			if (raft->current_term != leader_term) {
+				acked = false;
+				break;
+			}
+
+			if (raft->state != RAFT_FOLLOWER) {
+				acked = false;
+				break;
+			}
+
+			if (raft->follower_state.current_leader.id == 0) {
+				acked = false;
+				break;
+			}
+
+			if (raft->follower_state.current_leader.id !=
+			    leader_id) {
+				acked = false;
+				break;
+			}
+
+			n_acks++;
+		}
+
+		if (!acked || n_acks < (n_quorum / 2)) {
+			leader_id = 0;
+		}
+	}
+
+	changed = leader_id != f->leader_id;
+	f->leader_id = leader_id;
+
+	return changed;
+}
+
+/* Check for leader append-only.
+ *
+ * From Figure 3.2:
+ *
+ *   Leader Append-Only -> A leader never overwrites or deletes entries in its
+ *   own log; it only appends new entries. */
+static void checkLeaderAppendOnly(struct raft_fixture *f)
+{
+	struct raft *raft;
+	raft_index index;
+	raft_index last = logLastIndex(f->log);
+
+	/* If the cached log is empty it means there was no leader before. */
+	if (last == 0) {
+		return;
+	}
+
+	/* If there's no new leader, just return. */
+	if (f->leader_id == 0) {
+		return;
+	}
+
+	raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
+	last = logLastIndex(f->log);
+
+	for (index = 1; index <= last; index++) {
+		const struct raft_entry *entry1;
+		const struct raft_entry *entry2;
+		size_t i;
+
+		entry1 = logGet(f->log, index);
+		entry2 = logGet(raft->log, index);
+
+		assert(entry1 != NULL);
+
+		/* Check if the entry was snapshotted. */
+		if (entry2 == NULL) {
+			assert(raft->log->snapshot.last_index >= index);
+			continue;
+		}
+
+		/* Entry was not overwritten. */
+		assert(entry1->type == entry2->type);
+		assert(entry1->term == entry2->term);
+		for (i = 0; i < entry1->buf.len; i++) {
+			assert(((uint8_t *)entry1->buf.base)[i] ==
+			       ((uint8_t *)entry2->buf.base)[i]);
+		}
+	}
+}
+
+/* Make a copy of the current leader's log, in order to perform the Leader
+ * Append-Only check at the next iteration. */
+static void copyLeaderLog(struct raft_fixture *f)
+{
+	struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
+	struct raft_entry *entries;
+	unsigned n;
+	size_t i;
+	int rv;
+	logClose(f->log);
+	f->log = logInit();
+	if (f->log == NULL) {
+		assert(false);
+		return;
+	}
+
+	rv = logAcquire(raft->log, 1, &entries, &n);
+	assert(rv == 0);
+	for (i = 0; i < n; i++) {
+		struct raft_entry *entry = &entries[i];
+		struct raft_buffer buf;
+		buf.len = entry->buf.len;
+		buf.base = raft_malloc(buf.len);
+		assert(buf.base != NULL);
+		memcpy(buf.base, entry->buf.base, buf.len);
+		rv = logAppend(f->log, entry->term, entry->type, &buf, NULL);
+		assert(rv == 0);
+	}
+	logRelease(raft->log, 1, entries, n);
+}
+
+/* Update the commit index to match the one from the current leader. */
+static void updateCommitIndex(struct raft_fixture *f)
+{
+	struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
+	if (raft->commit_index > f->commit_index) {
+		f->commit_index = raft->commit_index;
+	}
+}
+
+/* Return the lowest tick time across all servers, along with the associated
+ * server index */
+static void getLowestTickTime(struct raft_fixture *f, raft_time *t, unsigned *i)
+{
+	unsigned j;
+	*t = (raft_time)-1 /* Maximum value */;
+	for (j = 0; j < f->n; j++) {
+		struct io *io = f->servers[j]->io.impl;
+		if (io->next_tick < *t) {
+			*t = io->next_tick;
+			*i = j;
+		}
+	}
+}
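getLowestTickTime and getLowestRequestCompletionTime (below) feed the
discrete-event loop in raft_fixture_step: the global clock always jumps
straight to the earliest pending event, whether that is a timer tick or an
I/O completion. A sketch of the selection rule with hypothetical values:

    raft_time tick_time = 1100;       /* earliest next_tick across servers */
    raft_time completion_time = 1015; /* earliest pending I/O completion */
    raft_time now =
        tick_time < completion_time ? tick_time : completion_time; /* 1015 */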
+
+/* Return the completion time of the request with the lowest completion time
+ * across all servers, along with the associated server index. */
+static void getLowestRequestCompletionTime(struct raft_fixture *f,
+					   raft_time *t,
+					   unsigned *i)
+{
+	unsigned j;
+	*t = (raft_time)-1 /* Maximum value */;
+	for (j = 0; j < f->n; j++) {
+		struct io *io = f->servers[j]->io.impl;
+		queue *head;
+		QUEUE_FOREACH(head, &io->requests)
+		{
+			struct ioRequest *r =
+			    QUEUE_DATA(head, struct ioRequest, queue);
+			if (r->completion_time < *t) {
+				*t = r->completion_time;
+				*i = j;
+			}
+		}
+	}
+}
+
+/* Fire the tick callback of the i'th server. */
+static void fireTick(struct raft_fixture *f, unsigned i)
+{
+	struct io *io = f->servers[i]->io.impl;
+	f->time = io->next_tick;
+	f->event->server_index = i;
+	f->event->type = RAFT_FIXTURE_TICK;
+	io->next_tick += io->tick_interval;
+	if (f->servers[i]->alive) {
+		io->tick_cb(io->io);
+	}
+}
+
+/* Complete the first request with completion time @t on the @i'th server. */
+static void completeRequest(struct raft_fixture *f, unsigned i, raft_time t)
+{
+	struct io *io = f->servers[i]->io.impl;
+	queue *head;
+	struct ioRequest *r = NULL;
+	bool found = false;
+	f->time = t;
+	f->event->server_index = i;
+	QUEUE_FOREACH(head, &io->requests)
+	{
+		r = QUEUE_DATA(head, struct ioRequest, queue);
+		if (r->completion_time == t) {
+			found = true;
+			break;
+		}
+	}
+	assert(found);
+	QUEUE_REMOVE(head);
+	switch (r->type) {
+		case APPEND:
+			ioFlushAppend(io, (struct append *)r);
+			f->event->type = RAFT_FIXTURE_DISK;
+			break;
+		case SEND:
+			ioFlushSend(io, (struct send *)r);
+			f->event->type = RAFT_FIXTURE_NETWORK;
+			break;
+		case TRANSMIT:
+			ioDeliverTransmit(io, (struct transmit *)r);
+			f->event->type = RAFT_FIXTURE_NETWORK;
+			break;
+		case SNAPSHOT_PUT:
+			ioFlushSnapshotPut(io, (struct snapshot_put *)r);
+			f->event->type = RAFT_FIXTURE_DISK;
+			break;
+		case SNAPSHOT_GET:
+			ioFlushSnapshotGet(io, (struct snapshot_get *)r);
+			f->event->type = RAFT_FIXTURE_DISK;
+			break;
+		case ASYNC_WORK:
+			ioFlushAsyncWork(io, (struct async_work *)r);
+			f->event->type = RAFT_FIXTURE_WORK;
+			break;
+		default:
+			assert(0);
+	}
+}
+
+struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f)
+{
+	raft_time tick_time;
+	raft_time completion_time;
+	unsigned i = f->n;
+	unsigned j = f->n;
+
+	getLowestTickTime(f, &tick_time, &i);
+	getLowestRequestCompletionTime(f, &completion_time, &j);
+
+	assert(i < f->n || j < f->n);
+
+	if (tick_time < completion_time ||
+	    (tick_time == completion_time && i <= j)) {
+		fireTick(f, i);
+	} else {
+		completeRequest(f, j, completion_time);
+	}
+
+	/* If the leader has not changed check the Leader Append-Only
+	 * guarantee. */
+	if (!updateLeaderAndCheckElectionSafety(f)) {
+		checkLeaderAppendOnly(f);
+	}
+
+	/* If we have a leader, update leader-related state. */
+	if (f->leader_id != 0) {
+		copyLeaderLog(f);
+		updateCommitIndex(f);
+	}
+
+	if (f->hook != NULL) {
+		f->hook(f, f->event);
+	}
+
+	return f->event;
+}
+
+struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f,
+					       unsigned n)
+{
+	unsigned i;
+	assert(n > 0);
+	for (i = 0; i < n - 1; i++) {
+		raft_fixture_step(f);
+	}
+	return raft_fixture_step(f);
+}
+
+bool raft_fixture_step_until(struct raft_fixture *f,
+			     bool (*stop)(struct raft_fixture *f, void *arg),
+			     void *arg,
+			     unsigned max_msecs)
+{
+	raft_time start = f->time;
+	while (!stop(f, arg) && (f->time - start) < max_msecs) {
+		raft_fixture_step(f);
+	}
+	return f->time - start < max_msecs;
+}
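raft_fixture_step_until takes an arbitrary stop predicate, which is how all
the step_until_* helpers below are built. A sketch of a custom predicate
(hypothetical test code):

    static bool firstServerApplied(struct raft_fixture *f, void *arg)
    {
            raft_index index = *(raft_index *)arg;
            return raft_last_applied(raft_fixture_get(f, 0)) >= index;
    }

    /* In a test: step for at most 5000 fixture milliseconds. */
    raft_index index = 2;
    bool reached = raft_fixture_step_until(&f, firstServerApplied, &index, 5000);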
+
+/* A step function which always returns false, forcing
+ * raft_fixture_step_until to advance time at each iteration. */
+static bool spin(struct raft_fixture *f, void *arg)
+{
+	(void)f;
+	(void)arg;
+	return false;
+}
+
+void raft_fixture_step_until_elapsed(struct raft_fixture *f, unsigned msecs)
+{
+	raft_fixture_step_until(f, spin, NULL, msecs);
+}
+
+static bool hasLeader(struct raft_fixture *f, void *arg)
+{
+	(void)arg;
+	return f->leader_id != 0;
+}
+
+bool raft_fixture_step_until_has_leader(struct raft_fixture *f,
+					unsigned max_msecs)
+{
+	return raft_fixture_step_until(f, hasLeader, NULL, max_msecs);
+}
+
+static bool hasNoLeader(struct raft_fixture *f, void *arg)
+{
+	(void)arg;
+	return f->leader_id == 0;
+}
+
+bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f,
+					   unsigned max_msecs)
+{
+	return raft_fixture_step_until(f, hasNoLeader, NULL, max_msecs);
+}
+
+/* Enable/disable dropping outgoing messages of a certain type from all servers
+ * except one. */
+static void dropAllExcept(struct raft_fixture *f,
+			  int type,
+			  bool flag,
+			  unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		struct raft_fixture_server *s = f->servers[j];
+		if (j == i) {
+			continue;
+		}
+		ioDrop(s->io.impl, type, flag);
+	}
+}
+
+/* Set the randomized election timeout of the given server to the minimum value
+ * compatible with its current state and timers. */
+static void minimizeRandomizedElectionTimeout(struct raft_fixture *f,
+					      unsigned i)
+{
+	struct raft *raft = &f->servers[i]->raft;
+	raft_time now = raft->io->time(raft->io);
+	unsigned timeout = raft->election_timeout;
+	assert(raft->state == RAFT_FOLLOWER);
+
+	/* If the minimum election timeout value would make the timer expire in
+	 * the past, cap it. */
+	if (now - raft->election_timer_start > timeout) {
+		timeout = (unsigned)(now - raft->election_timer_start);
+	}
+
+	raft->follower_state.randomized_election_timeout = timeout;
+}
+
+/* Set the randomized election timeout to the maximum value on all servers
+ * except the given one. */
+static void maximizeAllRandomizedElectionTimeoutsExcept(struct raft_fixture *f,
+							unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		struct raft *raft = &f->servers[j]->raft;
+		unsigned timeout = raft->election_timeout * 2;
+		if (j == i) {
+			continue;
+		}
+		assert(raft->state == RAFT_FOLLOWER);
+		raft->follower_state.randomized_election_timeout = timeout;
+	}
+}
+
+void raft_fixture_hook(struct raft_fixture *f, raft_fixture_event_cb hook)
+{
+	f->hook = hook;
+}
+
+void raft_fixture_start_elect(struct raft_fixture *f, unsigned i)
+{
+	struct raft *raft = raft_fixture_get(f, i);
+	unsigned j;
+
+	/* Make sure there's currently no leader. */
+	assert(f->leader_id == 0);
+
+	/* Make sure that the given server is voting. */
+	assert(configurationGet(&raft->configuration, raft->id)->role ==
+	       RAFT_VOTER);
+
+	/* Make sure all servers are currently followers. */
+	for (j = 0; j < f->n; j++) {
+		assert(raft_state(&f->servers[j]->raft) == RAFT_FOLLOWER);
+	}
+
+	/* Pretend that the last randomized election timeout was set at the
+	 * maximum value on all servers except the one to be elected, which is
+	 * instead set to the minimum possible value compatible with its
+	 * current state. */
+	minimizeRandomizedElectionTimeout(f, i);
+	maximizeAllRandomizedElectionTimeoutsExcept(f, i);
+}
+
+void raft_fixture_elect(struct raft_fixture *f, unsigned i)
+{
+	struct raft *raft = raft_fixture_get(f, i);
+	raft_fixture_start_elect(f, i);
+	raft_fixture_step_until_has_leader(f, ELECTION_TIMEOUT * 20);
+	assert(f->leader_id == raft->id);
+}
+
+void raft_fixture_depose(struct raft_fixture *f)
+{
+	unsigned leader_i;
+
+	/* Make sure there's a leader. */
+	assert(f->leader_id != 0);
+	leader_i = (unsigned)f->leader_id - 1;
+	assert(raft_state(&f->servers[leader_i]->raft) == RAFT_LEADER);
+
+	/* Set a very large election timeout on all followers, to prevent them
+	 * from starting an election. */
+	maximizeAllRandomizedElectionTimeoutsExcept(f, leader_i);
+
+	/* Prevent all servers from sending append entries results, so the
+	 * leader will eventually step down. */
+	dropAllExcept(f, RAFT_IO_APPEND_ENTRIES_RESULT, true, leader_i);
+
+	raft_fixture_step_until_has_no_leader(f, ELECTION_TIMEOUT * 3);
+	assert(f->leader_id == 0);
+
+	dropAllExcept(f, RAFT_IO_APPEND_ENTRIES_RESULT, false, leader_i);
+}
+
+struct step_apply
+{
+	unsigned i;
+	raft_index index;
+};
+
+static bool hasAppliedIndex(struct raft_fixture *f, void *arg)
+{
+	struct step_apply *apply = (struct step_apply *)arg;
+	struct raft *raft;
+	unsigned n = 0;
+	unsigned i;
+
+	if (apply->i < f->n) {
+		raft = raft_fixture_get(f, apply->i);
+		return raft_last_applied(raft) >= apply->index;
+	}
+
+	for (i = 0; i < f->n; i++) {
+		raft = raft_fixture_get(f, i);
+		if (raft_last_applied(raft) >= apply->index) {
+			n++;
+		}
+	}
+	return n == f->n;
+}
+
+bool raft_fixture_step_until_applied(struct raft_fixture *f,
+				     unsigned i,
+				     raft_index index,
+				     unsigned max_msecs)
+{
+	struct step_apply apply = {i, index};
+	return raft_fixture_step_until(f, hasAppliedIndex, &apply, max_msecs);
+}
+
+struct step_state
+{
+	unsigned i;
+	int state;
+};
+
+static bool hasState(struct raft_fixture *f, void *arg)
+{
+	struct step_state *target = (struct step_state *)arg;
+	struct raft *raft;
+	raft = raft_fixture_get(f, target->i);
+	return raft_state(raft) == target->state;
+}
+
+bool raft_fixture_step_until_state_is(struct raft_fixture *f,
+				      unsigned i,
+				      int state,
+				      unsigned max_msecs)
+{
+	struct step_state target = {i, state};
+	return raft_fixture_step_until(f, hasState, &target, max_msecs);
+}
+
+struct step_term
+{
+	unsigned i;
+	raft_term term;
+};
+
+static bool hasTerm(struct raft_fixture *f, void *arg)
+{
+	struct step_term *target = (struct step_term *)arg;
+	struct raft *raft;
+	raft = raft_fixture_get(f, target->i);
+	return raft->current_term == target->term;
+}
+
+bool raft_fixture_step_until_term_is(struct raft_fixture *f,
+				     unsigned i,
+				     raft_term term,
+				     unsigned max_msecs)
+{
+	struct step_term target = {i, term};
+	return raft_fixture_step_until(f, hasTerm, &target, max_msecs);
+}
+
+struct step_vote
+{
+	unsigned i;
+	unsigned j;
+};
+
+static bool hasVotedFor(struct raft_fixture *f, void *arg)
+{
+	struct step_vote *target = (struct step_vote *)arg;
+	struct raft *raft;
+	raft = raft_fixture_get(f, target->i);
+	return raft->voted_for == target->j + 1;
+}
+
+bool raft_fixture_step_until_voted_for(struct raft_fixture *f,
+				       unsigned i,
+				       unsigned j,
+				       unsigned max_msecs)
+{
+	struct step_vote target = {i, j};
+	return raft_fixture_step_until(f, hasVotedFor, &target, max_msecs);
+}
+
+struct step_deliver
+{
+	unsigned i;
+	unsigned j;
+};
+
+static bool hasDelivered(struct raft_fixture *f, void *arg)
+{
+	struct step_deliver *target = (struct step_deliver *)arg;
+	struct raft *raft;
+	struct io *io;
+	struct raft_message *message;
+	queue *head;
+	raft = raft_fixture_get(f, target->i);
+	io = raft->io->impl;
+	QUEUE_FOREACH(head, &io->requests)
+	{
+		struct ioRequest *r;
+		r = QUEUE_DATA(head, struct ioRequest, queue);
+		message = NULL;
+		switch (r->type) {
+			case SEND:
+				message = &((struct send *)r)->message;
+				break;
+			case TRANSMIT:
+				message = &((struct transmit *)r)->message;
+				break;
+		}
+		if (message != NULL && message->server_id == target->j + 1) {
+			return false;
+		}
+	}
+	return true;
+}
+
+bool raft_fixture_step_until_delivered(struct raft_fixture *f,
+				       unsigned i,
+				       unsigned j,
+				       unsigned max_msecs)
+{
+	struct step_deliver target = {i, j};
+	return raft_fixture_step_until(f, hasDelivered, &target, max_msecs);
+}
+
+void raft_fixture_disconnect(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	ioDisconnect(io1, io2);
+}
+
+void raft_fixture_reconnect(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	ioReconnect(io1, io2);
+}
+
+void raft_fixture_saturate(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	ioSaturate(io1, io2);
+}
+
+static void disconnectFromAll(struct raft_fixture *f, unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		if (j == i) {
+			continue;
+		}
+		raft_fixture_saturate(f, i, j);
+		raft_fixture_saturate(f, j, i);
+	}
+}
+
+static void reconnectToAll(struct raft_fixture *f, unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		if (j == i) {
+			continue;
+		}
+		/* Don't reconnect to disconnected peers */
+		if (!f->servers[j]->alive) {
+			continue;
+		}
+		raft_fixture_desaturate(f, i, j);
+		raft_fixture_desaturate(f, j, i);
+	}
+}
+
+bool raft_fixture_saturated(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	return ioSaturated(io1, io2);
+}
+
+void raft_fixture_desaturate(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	ioDesaturate(io1, io2);
+}
+
+void raft_fixture_kill(struct raft_fixture *f, unsigned i)
+{
+	disconnectFromAll(f, i);
+	f->servers[i]->alive = false;
+}
+
+void raft_fixture_revive(struct raft_fixture *f, unsigned i)
+{
+	reconnectToAll(f, i);
+	f->servers[i]->alive = true;
+}
+
+int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm)
+{
+	unsigned i;
+	unsigned j;
+	int rc;
+	i = f->n;
+	f->n++;
+
+	rc = serverInit(f, i, fsm);
+	if (rc != 0) {
+		return rc;
+	}
+
+	serverConnectToAll(f, i);
+	for (j = 0; j < f->n; j++) {
+		struct raft_io *io1 = &f->servers[i]->io;
+		struct raft_io *io2 = &f->servers[j]->io;
+		ioConnect(io2, io1);
+	}
+
+	return 0;
+}
+
+void raft_fixture_set_randomized_election_timeout(struct raft_fixture *f,
+						  unsigned i,
+						  unsigned msecs)
+{
+	struct io *io = f->servers[i]->io.impl;
+	io->randomized_election_timeout = msecs;
+}
+
+void raft_fixture_set_network_latency(struct raft_fixture *f,
+				      unsigned i,
+				      unsigned msecs)
+{
+	struct io *io = f->servers[i]->io.impl;
+	io->network_latency = msecs;
+}
+
+void raft_fixture_set_disk_latency(struct raft_fixture *f,
+				   unsigned i,
+				   unsigned msecs)
+{
+	struct io *io = f->servers[i]->io.impl;
+	io->disk_latency = msecs;
+}
+
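The latency setters above, together with the per-link send latency that
follows, let a test shape the simulated environment before stepping the
cluster; a sketch (hypothetical values, `f` an initialized fixture):

    raft_fixture_set_network_latency(&f, 0, 50); /* RPCs from server 0 take 50 ms */
    raft_fixture_set_disk_latency(&f, 1, 100);   /* server 1's disk writes take 100 ms */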
+void raft_fixture_set_send_latency(struct raft_fixture *f, + unsigned i, + unsigned j, + unsigned msecs) +{ + struct io *io = f->servers[i]->io.impl; + struct peer *peer = ioGetPeer(io, f->servers[j]->id); + peer->send_latency = msecs; +} + +void raft_fixture_set_term(struct raft_fixture *f, unsigned i, raft_term term) +{ + struct io *io = f->servers[i]->io.impl; + io->term = term; +} + +void raft_fixture_set_snapshot(struct raft_fixture *f, + unsigned i, + struct raft_snapshot *snapshot) +{ + struct io *io = f->servers[i]->io.impl; + io->snapshot = snapshot; +} + +void raft_fixture_add_entry(struct raft_fixture *f, + unsigned i, + struct raft_entry *entry) +{ + struct io *io = f->servers[i]->io.impl; + struct raft_entry *entries; + entries = raft_realloc(io->entries, (io->n + 1) * sizeof *entries); + assert(entries != NULL); + entries[io->n] = *entry; + io->entries = entries; + io->n++; +} + +void raft_fixture_append_fault(struct raft_fixture *f, unsigned i, int delay) +{ + struct io *io = f->servers[i]->io.impl; + io->append_fault_countdown = delay; +} + +void raft_fixture_vote_fault(struct raft_fixture *f, unsigned i, int delay) +{ + struct io *io = f->servers[i]->io.impl; + io->vote_fault_countdown = delay; +} + +void raft_fixture_term_fault(struct raft_fixture *f, unsigned i, int delay) +{ + struct io *io = f->servers[i]->io.impl; + io->term_fault_countdown = delay; +} + +void raft_fixture_send_fault(struct raft_fixture *f, unsigned i, int delay) +{ + struct io *io = f->servers[i]->io.impl; + io->send_fault_countdown = delay; +} + +unsigned raft_fixture_n_send(struct raft_fixture *f, unsigned i, int type) +{ + struct io *io = f->servers[i]->io.impl; + return io->n_send[type]; +} + +unsigned raft_fixture_n_recv(struct raft_fixture *f, unsigned i, int type) +{ + struct io *io = f->servers[i]->io.impl; + return io->n_recv[type]; +} + +void raft_fixture_make_unavailable(struct raft_fixture *f, unsigned i) +{ + struct raft *r = &f->servers[i]->raft; + convertToUnavailable(r); +} diff --git a/src/raft/flags.c b/src/raft/flags.c new file mode 100644 index 000000000..7247613ab --- /dev/null +++ b/src/raft/flags.c @@ -0,0 +1,16 @@ +#include "flags.h" + +inline raft_flags flagsSet(raft_flags in, raft_flags flags) +{ + return in | flags; +} + +inline raft_flags flagsClear(raft_flags in, raft_flags flags) +{ + return in & (~flags); +} + +inline bool flagsIsSet(raft_flags in, raft_flags flag) +{ + return (bool)(in & flag); +} diff --git a/src/raft/flags.h b/src/raft/flags.h new file mode 100644 index 000000000..79d2a8428 --- /dev/null +++ b/src/raft/flags.h @@ -0,0 +1,20 @@ +#ifndef FLAGS_H_ +#define FLAGS_H_ + +#include "../raft.h" + +#define RAFT_DEFAULT_FEATURE_FLAGS (0) + +/* Adds the flags @flags to @in and returns the new flags. Multiple flags should + * be combined using the `|` operator. */ +raft_flags flagsSet(raft_flags in, raft_flags flags); + +/* Clears the flags @flags from @in and returns the new flags. Multiple flags + * should be combined using the `|` operator. */ +raft_flags flagsClear(raft_flags in, raft_flags flags); + +/* Returns `true` if the single flag @flag is set in @in, otherwise returns + * `false`. 
 */
+bool flagsIsSet(raft_flags in, raft_flags flag);
+
+#endif /* FLAGS_H_ */
diff --git a/src/raft/heap.c b/src/raft/heap.c
new file mode 100644
index 000000000..9361cd12b
--- /dev/null
+++ b/src/raft/heap.c
@@ -0,0 +1,121 @@
+#include "heap.h"
+
+#include <stdlib.h>
+
+#include "../raft.h"
+
+static void *defaultMalloc(void *data, size_t size)
+{
+	(void)data;
+	return malloc(size);
+}
+
+static void defaultFree(void *data, void *ptr)
+{
+	(void)data;
+	free(ptr);
+}
+
+static void *defaultCalloc(void *data, size_t nmemb, size_t size)
+{
+	(void)data;
+	return calloc(nmemb, size);
+}
+
+static void *defaultRealloc(void *data, void *ptr, size_t size)
+{
+	(void)data;
+	return realloc(ptr, size);
+}
+
+static void *defaultAlignedAlloc(void *data, size_t alignment, size_t size)
+{
+	(void)data;
+	return aligned_alloc(alignment, size);
+}
+
+static void defaultAlignedFree(void *data, size_t alignment, void *ptr)
+{
+	(void)alignment;
+	defaultFree(data, ptr);
+}
+
+static struct raft_heap defaultHeap = {
+	NULL,                /* data */
+	defaultMalloc,       /* malloc */
+	defaultFree,         /* free */
+	defaultCalloc,       /* calloc */
+	defaultRealloc,      /* realloc */
+	defaultAlignedAlloc, /* aligned_alloc */
+	defaultAlignedFree   /* aligned_free */
+};
+
+static struct raft_heap *currentHeap = &defaultHeap;
+
+void *RaftHeapMalloc(size_t size)
+{
+	return currentHeap->malloc(currentHeap->data, size);
+}
+
+void RaftHeapFree(void *ptr)
+{
+	if (ptr == NULL) {
+		return;
+	}
+	currentHeap->free(currentHeap->data, ptr);
+}
+
+void *RaftHeapCalloc(size_t nmemb, size_t size)
+{
+	return currentHeap->calloc(currentHeap->data, nmemb, size);
+}
+
+void *RaftHeapRealloc(void *ptr, size_t size)
+{
+	return currentHeap->realloc(currentHeap->data, ptr, size);
+}
+
+void *raft_malloc(size_t size)
+{
+	return RaftHeapMalloc(size);
+}
+
+void raft_free(void *ptr)
+{
+	RaftHeapFree(ptr);
+}
+
+void *raft_calloc(size_t nmemb, size_t size)
+{
+	return RaftHeapCalloc(nmemb, size);
+}
+
+void *raft_realloc(void *ptr, size_t size)
+{
+	return RaftHeapRealloc(ptr, size);
+}
+
+void *raft_aligned_alloc(size_t alignment, size_t size)
+{
+	return currentHeap->aligned_alloc(currentHeap->data, alignment, size);
+}
+
+void raft_aligned_free(size_t alignment, void *ptr)
+{
+	currentHeap->aligned_free(currentHeap->data, alignment, ptr);
+}
+
+void raft_heap_set(struct raft_heap *heap)
+{
+	currentHeap = heap;
+}
+
+void raft_heap_set_default(void)
+{
+	currentHeap = &defaultHeap;
+}
+
+const struct raft_heap *raft_heap_get(void)
+{
+	return currentHeap;
+}
diff --git a/src/raft/heap.h b/src/raft/heap.h
new file mode 100644
index 000000000..005b5ea9c
--- /dev/null
+++ b/src/raft/heap.h
@@ -0,0 +1,16 @@
+/* Internal heap APIs. */
+
+#ifndef HEAP_H_
+#define HEAP_H_
+
+#include <stddef.h>
+
+void *RaftHeapMalloc(size_t size);
+
+void *RaftHeapCalloc(size_t nmemb, size_t size);
+
+void *RaftHeapRealloc(void *ptr, size_t size);
+
+void RaftHeapFree(void *ptr);
+
+#endif /* HEAP_H_ */
diff --git a/src/raft/lifecycle.c b/src/raft/lifecycle.c
new file mode 100644
index 000000000..bd6d618c7
--- /dev/null
+++ b/src/raft/lifecycle.c
@@ -0,0 +1,36 @@
+#include "lifecycle.h"
+#include "../tracing.h"
+#include "queue.h"
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+static bool reqIdIsSet(const struct request *req)
+{
+	return req->req_id[15] == (uint8_t)-1;
+}
+
+static uint64_t extractReqId(const struct request *req)
+{
+	uint64_t id;
+	memcpy(&id, &req->req_id, sizeof(id));
+	return id;
+}
+
+void lifecycleRequestStart(struct raft *r, struct request *req)
+{
+	if (reqIdIsSet(req)) {
+		tracef("request start id:%" PRIu64, extractReqId(req));
+	}
+	QUEUE_PUSH(&r->leader_state.requests, &req->queue);
+}
+
+void lifecycleRequestEnd(struct raft *r, struct request *req)
+{
+	(void)r;
+	if (reqIdIsSet(req)) {
+		tracef("request end id:%" PRIu64, extractReqId(req));
+	}
+	QUEUE_REMOVE(&req->queue);
+}
diff --git a/src/raft/lifecycle.h b/src/raft/lifecycle.h
new file mode 100644
index 000000000..616a260a0
--- /dev/null
+++ b/src/raft/lifecycle.h
@@ -0,0 +1,10 @@
+#ifndef LIFECYCLE_H_
+#define LIFECYCLE_H_
+
+#include "../raft.h"
+#include "request.h"
+
+void lifecycleRequestStart(struct raft *r, struct request *req);
+void lifecycleRequestEnd(struct raft *r, struct request *req);
+
+#endif
diff --git a/src/raft/log.c b/src/raft/log.c
new file mode 100644
index 000000000..434ad952f
--- /dev/null
+++ b/src/raft/log.c
@@ -0,0 +1,996 @@
+#include "log.h"
+
+#include <string.h>
+
+#include "../raft.h"
+#include "assert.h"
+#include "configuration.h"
+
+/* Calculate the reference count hash table key for the given log entry index
+ * in a hash table of the given size.
+ *
+ * The hash is simply the log entry index minus one modulo the size. This
+ * minimizes conflicts in the most frequent case, where a new log entry is
+ * simply appended to the log and can use the hash table bucket next to the
+ * bucket for the entry with the previous index (possibly resizing the table if
+ * its capacity is reached). */
+static size_t refsKey(const raft_index index, const size_t size)
+{
+	assert(index > 0);
+	assert(size > 0);
+	return (size_t)((index - 1) % size);
+}
+
+/* Try to insert a new reference count item for the given log entry index into
+ * the given reference count hash table.
+ *
+ * A collision happens when the bucket associated with the hash key of the
+ * given log entry index is already used to refcount log entries with a
+ * different index. In that case the collision output parameter will be set to
+ * true and no new reference count item is inserted into the hash table.
+ *
+ * If two log entries have the same index but different terms, the associated
+ * bucket will be grown accordingly. */
+static int refsTryInsert(struct raft_entry_ref *table,
+			 const size_t size,
+			 const raft_term term,
+			 const raft_index index,
+			 const unsigned short count,
+			 struct raft_buffer buf,
+			 void *batch,
+			 bool *collision)
+{
+	struct raft_entry_ref *bucket; /* Bucket associated with this index. */
+	struct raft_entry_ref *next_slot; /* For traversing the bucket slots. */
+	struct raft_entry_ref
+	    *last_slot; /* To track the last traversed slot. */
+	struct raft_entry_ref *slot; /* Actual slot to use for this entry.
*/ + size_t key; + + assert(table != NULL); + assert(size > 0); + assert(term > 0); + assert(index > 0); + assert(count > 0); + assert(collision != NULL); + + /* Calculate the hash table key for the given index. */ + key = refsKey(index, size); + bucket = &table[key]; + + /* If a bucket is empty, then there's no collision and we can fill its + * first slot. */ + if (bucket->count == 0) { + assert(bucket->next == NULL); + slot = bucket; + goto fill; + } + + /* If the bucket is already used to refcount entries with a different + * index, then we have a collision and we must abort here. */ + if (bucket->index != index) { + *collision = true; + return 0; + } + + /* If we get here it means that the bucket is in use to refcount one or + * more entries with the same index as the given one, but different + * terms. + * + * We must append a newly allocated slot to refcount the entry with this + * term. + * + * So first let's find the last slot in the bucket. */ + for (next_slot = bucket; next_slot != NULL; + next_slot = next_slot->next) { + /* All entries in a bucket must have the same index. */ + assert(next_slot->index == index); + + /* It should never happen that two entries with the same index + * and term get appended. So no existing slot in this bucket + * must track an entry with the same term as the given one. */ + assert(next_slot->term != term); + + last_slot = next_slot; + } + + /* The last slot must have no next slot. */ + assert(last_slot->next == NULL); + + slot = raft_malloc(sizeof *slot); + if (slot == NULL) { + return RAFT_NOMEM; + } + + last_slot->next = slot; + +fill: + slot->term = term; + slot->index = index; + slot->count = count; + slot->buf = buf; + slot->batch = batch; + slot->next = NULL; + + *collision = false; + + return 0; +} + +/* Move the slots of the given bucket into the given reference count hash + * table. The key of the bucket to use in the given table will be re-calculated + * according to the given size. */ +static int refsMove(struct raft_entry_ref *bucket, + struct raft_entry_ref *table, + const size_t size) +{ + struct raft_entry_ref *slot; + struct raft_entry_ref *next_slot; + + assert(bucket != NULL); + assert(table != NULL); + assert(size > 0); + + /* Only non-empty buckets should be moved. */ + assert(bucket->count > 0); + + /* For each slot in the bucket, insert the relevant entry in the given + * table, then free it. */ + next_slot = bucket; + while (next_slot != NULL) { + bool collision; + int rv; + + slot = next_slot; + + /* Insert the reference count for this entry into the new table. + */ + rv = refsTryInsert(table, size, slot->term, slot->index, + slot->count, slot->buf, slot->batch, + &collision); + + next_slot = slot->next; + + /* Unless this is the very first slot in the bucket, we need to + * free the slot. */ + if (slot != bucket) { + raft_free(slot); + } + + if (rv != 0) { + return rv; + } + + /* The given hash table is assumed to be large enough to hold + * all ref counts without any conflict. */ + assert(!collision); + }; + + return 0; +} + +/* Grow the size of the reference count hash table. */ +static int refsGrow(struct raft_log *l) +{ + struct raft_entry_ref *table; /* New hash table. */ + size_t size; /* Size of the new hash table. 
*/ + size_t i; + + assert(l != NULL); + assert(l->refs_size > 0); + + size = l->refs_size * 2; /* Double the table size */ + + table = raft_calloc(size, sizeof *table); + if (table == NULL) { + return RAFT_NOMEM; + } + + /* Populate the new hash table, inserting all entries existing in the + * current hash table. Each bucket will have a different key in the new + * hash table, since the size has changed. */ + for (i = 0; i < l->refs_size; i++) { + struct raft_entry_ref *bucket = &l->refs[i]; + if (bucket->count > 0) { + int rv = refsMove(bucket, table, size); + if (rv != 0) { + return rv; + } + } else { + /* If the count is zero, we expect that the bucket is + * unused. */ + assert(bucket->next == NULL); + } + } + + raft_free(l->refs); + + l->refs = table; + l->refs_size = size; + + return 0; +} + +/* Initialize the reference count of the entry with the given index, setting it + * to 1. */ +static int refsInit(struct raft_log *l, + const raft_term term, + const raft_index index, + struct raft_buffer buf, + void *batch) +{ + int i; + + assert(l != NULL); + assert(term > 0); + assert(index > 0); + + /* Initialize the hash map with a reasonable size */ + if (l->refs == NULL) { + l->refs_size = LOG__REFS_INITIAL_SIZE; + l->refs = raft_calloc(l->refs_size, sizeof *l->refs); + if (l->refs == NULL) { + return RAFT_NOMEM; + } + } + + /* Check if the bucket associated with the given index is available + * (i.e. there are no collisions), or grow the table and re-key it + * otherwise. + * + * We limit the number of times we try to grow the table to 10, to avoid + * eating up too much memory. In practice, there should never be a case + * where this is not enough. */ + for (i = 0; i < 10; i++) { + bool collision; + int rc; + + rc = refsTryInsert(l->refs, l->refs_size, term, index, 1, buf, + batch, &collision); + if (rc != 0) { + return RAFT_NOMEM; + } + + if (!collision) { + return 0; + } + + rc = refsGrow(l); + if (rc != 0) { + return rc; + } + }; + + return RAFT_NOMEM; +} + +/* Increment the refcount of the entry with the given term and index. */ +static void refsIncr(struct raft_log *l, + const raft_term term, + const raft_index index) +{ + size_t key; /* Hash table key for the given index. */ + struct raft_entry_ref *slot; /* Slot for the given term/index */ + + assert(l != NULL); + assert(term > 0); + assert(index > 0); + + key = refsKey(index, l->refs_size); + + /* Lookup the slot associated with the given term/index, which must have + * been previously inserted. */ + slot = &l->refs[key]; + while (1) { + assert(slot != NULL); + assert(slot->index == index); + if (slot->term == term) { + break; + } + slot = slot->next; + } + assert(slot != NULL); + + slot->count++; +} + +/* Decrement the refcount of the entry with the given index. Return a boolean + * indicating whether the entry has now zero references. */ +static bool refsDecr(struct raft_log *l, + const raft_term term, + const raft_index index) +{ + size_t key; /* Hash table key for the given index. */ + struct raft_entry_ref *slot; /* Slot for the given term/index */ + struct raft_entry_ref + *prev_slot; /* Slot preceeding the one to decrement */ + + assert(l != NULL); + assert(term > 0); + assert(index > 0); + + key = refsKey(index, l->refs_size); + prev_slot = NULL; + + /* Lookup the slot associated with the given term/index, keeping track + * of its previous slot in the bucket list. 
*/ + slot = &l->refs[key]; + while (1) { + assert(slot != NULL); + assert(slot->index == index); + if (slot->term == term) { + break; + } + prev_slot = slot; + slot = slot->next; + } + + slot->count--; + + if (slot->count > 0) { + /* The entry is still referenced. */ + return false; + } + + /* If the refcount has dropped to zero, delete the slot. */ + if (prev_slot != NULL) { + /* This isn't the very first slot, simply unlink it from the + * slot list. */ + prev_slot->next = slot->next; + raft_free(slot); + } else if (slot->next != NULL) { + /* This is the very first slot, and slot list is not empty. Copy + * the second slot into the first one, then delete it. */ + struct raft_entry_ref *second_slot = slot->next; + *slot = *second_slot; + raft_free(second_slot); + } + + return true; +} + +struct raft_log *logInit(void) +{ + struct raft_log *log; + + log = raft_malloc(sizeof(*log)); + if (log == NULL) { + return NULL; + } + + log->entries = NULL; + log->size = 0; + log->front = log->back = 0; + log->offset = 0; + log->refs = NULL; + log->refs_size = 0; + log->snapshot.last_index = 0; + log->snapshot.last_term = 0; + + return log; +} + +/* Return the index of the i'th entry in the log. */ +static raft_index indexAt(struct raft_log *l, size_t i) +{ + return l->offset + i + 1; +} + +/* Return the circular buffer position of the i'th entry in the log. */ +static size_t positionAt(struct raft_log *l, size_t i) +{ + return (l->front + i) % l->size; +} + +/* Return the i'th entry in the log. */ +static struct raft_entry *entryAt(struct raft_log *l, size_t i) +{ + return &l->entries[positionAt(l, i)]; +} + +void logClose(struct raft_log *l) +{ + void *batch = NULL; /* Last batch that has been freed */ + + assert(l != NULL); + + if (l->entries != NULL) { + size_t i; + size_t n = logNumEntries(l); + + for (i = 0; i < n; i++) { + struct raft_entry *entry = entryAt(l, i); + raft_index index = indexAt(l, i); + size_t key = refsKey(index, l->refs_size); + struct raft_entry_ref *slot = &l->refs[key]; + + /* We require that there are no outstanding references + * to active entries. */ + assert(slot->count == 1); + + /* TODO: we should support the case where the bucket has + * more than one slot. */ + assert(slot->next == NULL); + + /* Release the memory used by the entry data (either + * directly or via a batch). */ + if (entry->batch == NULL) { + if (entry->buf.base != NULL) { + raft_free(entry->buf.base); + } + } else { + if (entry->batch != batch) { + /* This batch was not released yet, so + * let's do it now. */ + batch = entry->batch; + raft_free(entry->batch); + } + } + } + + raft_free(l->entries); + } + + if (l->refs != NULL) { + raft_free(l->refs); + } + + raft_free(l); +} + +void logStart(struct raft_log *l, + raft_index snapshot_index, + raft_term snapshot_term, + raft_index start_index) +{ + assert(logNumEntries(l) == 0); + assert(start_index > 0); + assert(start_index <= snapshot_index + 1); + assert(snapshot_index == 0 || snapshot_term != 0); + l->snapshot.last_index = snapshot_index; + l->snapshot.last_term = snapshot_term; + l->offset = start_index - 1; +} + +/* Ensure that the entries array has enough free slots for adding a new entry. + */ +static int ensureCapacity(struct raft_log *l) +{ + struct raft_entry *entries; /* New entries array */ + size_t n; /* Current number of entries */ + size_t size; /* Size of the new array */ + size_t i; + + n = logNumEntries(l); + + if (n + 1 < l->size) { + return 0; + } + + /* Make the new size twice the current size plus one (for the new + * entry). 
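+	 * For example, an entries array of 3 slots is reallocated to
+	 * (3 + 1) * 2 = 8 slots.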
Over-allocating now avoids smaller allocations later. */ + size = (l->size + 1) * 2; + + entries = raft_calloc(size, sizeof *entries); + if (entries == NULL) { + return RAFT_NOMEM; + } + + /* Copy all active old entries to the beginning of the newly allocated + * array. */ + for (i = 0; i < n; i++) { + memcpy(&entries[i], entryAt(l, i), sizeof *entries); + } + + /* Release the old entries array. */ + if (l->entries != NULL) { + raft_free(l->entries); + } + + l->entries = entries; + l->size = size; + l->front = 0; + l->back = n; + + return 0; +} + +int logReinstate(struct raft_log *l, + raft_term term, + unsigned short type, + bool *reinstated) +{ + raft_index index; + size_t key; + struct raft_entry_ref *bucket; + struct raft_entry_ref *slot; + struct raft_entry *entry; + int rv; + + *reinstated = false; + + if (l->refs_size == 0) { + return 0; + } + + index = logLastIndex(l) + 1; + key = refsKey(index, l->refs_size); + bucket = &l->refs[key]; + if (bucket->count == 0 || bucket->index != index) { + return 0; + } + + for (slot = bucket; slot != NULL; slot = slot->next) { + if (slot->term == term) { + rv = ensureCapacity(l); + if (rv != 0) { + return rv; + } + slot->count++; + l->back++; + l->back %= l->size; + entry = &l->entries[l->back]; + entry->term = term; + entry->type = type; + entry->buf = slot->buf; + entry->batch = slot->batch; + *reinstated = true; + break; + } + } + + return 0; +} + +int logAppend(struct raft_log *l, + const raft_term term, + const unsigned short type, + const struct raft_buffer *buf, + void *batch) +{ + int rv; + struct raft_entry *entry; + raft_index index; + + assert(l != NULL); + assert(term > 0); + assert(type == RAFT_CHANGE || type == RAFT_BARRIER || + type == RAFT_COMMAND); + assert(buf != NULL); + + rv = ensureCapacity(l); + if (rv != 0) { + return rv; + } + + index = logLastIndex(l) + 1; + + rv = refsInit(l, term, index, *buf, batch); + if (rv != 0) { + return rv; + } + + entry = &l->entries[l->back]; + entry->term = term; + entry->type = type; + entry->buf = *buf; + entry->batch = batch; + + l->back += 1; + l->back = l->back % l->size; + + return 0; +} + +int logAppendCommands(struct raft_log *l, + const raft_term term, + const struct raft_buffer bufs[], + const unsigned n) +{ + unsigned i; + int rv; + + assert(l != NULL); + assert(term > 0); + assert(bufs != NULL); + assert(n > 0); + + for (i = 0; i < n; i++) { + const struct raft_buffer *buf = &bufs[i]; + rv = logAppend(l, term, RAFT_COMMAND, buf, NULL); + if (rv != 0) { + return rv; + } + } + + return 0; +} + +int logAppendConfiguration(struct raft_log *l, + const raft_term term, + const struct raft_configuration *configuration) +{ + struct raft_buffer buf; + int rv; + + assert(l != NULL); + assert(term > 0); + assert(configuration != NULL); + + /* Encode the configuration into a buffer. */ + rv = configurationEncode(configuration, &buf); + if (rv != 0) { + goto err; + } + + /* Append the new entry to the log. */ + rv = logAppend(l, term, RAFT_CHANGE, &buf, NULL); + if (rv != 0) { + goto err_after_encode; + } + + return 0; + +err_after_encode: + raft_free(buf.base); + +err: + assert(rv != 0); + return rv; +} + +size_t logNumEntries(struct raft_log *l) +{ + assert(l != NULL); + + /* The circular buffer is not wrapped. */ + if (l->front <= l->back) { + return l->back - l->front; + } + + /* The circular buffer is wrapped. 
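+	 * For example, with size=8, front=6 and back=2 the used slots are
+	 * 6, 7, 0 and 1, i.e. 8 - 6 + 2 = 4 entries.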
 */
+	return l->size - l->front + l->back;
+}
+
+raft_index logLastIndex(struct raft_log *l)
+{
+	/* If there are no entries in the log, but there is a snapshot
+	 * available, check that its last index is consistent with the
+	 * offset. */
+	if (logNumEntries(l) == 0 && l->snapshot.last_index != 0) {
+		assert(l->offset <= l->snapshot.last_index);
+	}
+	return l->offset + logNumEntries(l);
+}
+
+/* Return the position of the entry with the given index in the entries array.
+ *
+ * If no entry with the given index is in the log, return the size of the
+ * entries array. */
+static size_t locateEntry(struct raft_log *l, const raft_index index)
+{
+	size_t n = logNumEntries(l);
+
+	if (n == 0 || index < indexAt(l, 0) || index > indexAt(l, n - 1)) {
+		return l->size;
+	}
+
+	/* Get the circular buffer position of the desired entry. Log indexes
+	 * start at 1, so we subtract one to get array indexes. We also need to
+	 * subtract any index offset this log might start at. */
+	return positionAt(l, (size_t)((index - 1) - l->offset));
+}
+
+raft_term logTermOf(struct raft_log *l, const raft_index index)
+{
+	size_t i;
+	assert(index > 0);
+	assert(l->offset <= l->snapshot.last_index);
+
+	if ((index < l->offset + 1 && index != l->snapshot.last_index) ||
+	    index > logLastIndex(l)) {
+		return 0;
+	}
+
+	if (index == l->snapshot.last_index) {
+		assert(l->snapshot.last_term != 0);
+		/* Coherence check that if we still have the entry at
+		 * last_index, its term matches the one in the snapshot. */
+		i = locateEntry(l, index);
+		if (i != l->size) {
+			assert(l->entries[i].term == l->snapshot.last_term);
+		}
+		return l->snapshot.last_term;
+	}
+
+	i = locateEntry(l, index);
+	assert(i < l->size);
+	return l->entries[i].term;
+}
+
+raft_index logSnapshotIndex(struct raft_log *l)
+{
+	return l->snapshot.last_index;
+}
+
+raft_term logLastTerm(struct raft_log *l)
+{
+	raft_index last_index;
+	last_index = logLastIndex(l);
+	return last_index > 0 ? logTermOf(l, last_index) : 0;
+}
+
+const struct raft_entry *logGet(struct raft_log *l, const raft_index index)
+{
+	size_t i;
+
+	assert(l != NULL);
+
+	/* Get the array index of the desired entry. */
+	i = locateEntry(l, index);
+	if (i == l->size) {
+		return NULL;
+	}
+
+	assert(i < l->size);
+
+	return &l->entries[i];
+}
+
+int logAcquire(struct raft_log *l,
+	       const raft_index index,
+	       struct raft_entry *entries[],
+	       unsigned *n)
+{
+	size_t i;
+	size_t j;
+
+	assert(l != NULL);
+	assert(index > 0);
+	assert(entries != NULL);
+	assert(n != NULL);
+
+	/* Get the array index of the first entry to acquire. */
+	i = locateEntry(l, index);
+
+	if (i == l->size) {
+		*n = 0;
+		*entries = NULL;
+		return 0;
+	}
+
+	if (i < l->back) {
+		/* The last entry does not wrap with respect to i, so the
+		 * number of entries is simply the length of the range
+		 * [i...l->back). */
+		*n = (unsigned)(l->back - i);
+	} else {
+		/* The last entry wraps with respect to i, so the number of
+		 * entries is the sum of the lengths of the ranges
+		 * [i...l->size) and [0...l->back), which is
+		 * l->size - i + l->back. */
+		*n = (unsigned)(l->size - i + l->back);
+	}
+
+	assert(*n > 0);
+
+	*entries = raft_calloc(*n, sizeof **entries);
+	if (*entries == NULL) {
+		return RAFT_NOMEM;
+	}
+
+	for (j = 0; j < *n; j++) {
+		size_t k = (i + j) % l->size;
+		struct raft_entry *entry = &(*entries)[j];
+		*entry = l->entries[k];
+		refsIncr(l, entry->term, index + j);
+	}
+
+	return 0;
+}
+
+/* Return true if the given batch is referenced by any entry currently in the
+ * log.
*/ +static bool isBatchReferenced(struct raft_log *l, const void *batch) +{ + size_t i; + + /* Iterate through all live entries to see if there's one + * belonging to the same batch. This is slightly inefficient but + * this code path should be taken very rarely in practice. */ + for (i = 0; i < logNumEntries(l); i++) { + struct raft_entry *entry = entryAt(l, i); + if (entry->batch == batch) { + return true; + } + } + + return false; +} + +void logRelease(struct raft_log *l, + const raft_index index, + struct raft_entry entries[], + const unsigned n) +{ + size_t i; + void *batch = NULL; /* Last batch whose memory was freed */ + + assert(l != NULL); + assert((entries == NULL && n == 0) || (entries != NULL && n > 0)); + + for (i = 0; i < n; i++) { + struct raft_entry *entry = &entries[i]; + bool unref; + + unref = refsDecr(l, entry->term, index + i); + + /* If there are no outstanding references to this entry, free + * its payload if it's not part of a batch, or check if we can + * free the batch itself. */ + if (unref) { + if (entries[i].batch == NULL) { + if (entry->buf.base != NULL) { + raft_free(entries[i].buf.base); + } + } else { + if (entry->batch != batch) { + if (!isBatchReferenced(l, + entry->batch)) { + batch = entry->batch; + raft_free(batch); + } + } + } + } + } + + if (entries != NULL) { + raft_free(entries); + } +} + +/* Clear the log if it became empty. */ +static void clearIfEmpty(struct raft_log *l) +{ + if (logNumEntries(l) > 0) { + return; + } + raft_free(l->entries); + l->entries = NULL; + l->size = 0; + l->front = 0; + l->back = 0; +} + +/* Destroy an entry, possibly releasing the memory of its buffer. */ +static void destroyEntry(struct raft_log *l, struct raft_entry *entry) +{ + if (entry->batch == NULL) { + if (entry->buf.base != NULL) { + raft_free(entry->buf.base); + } + } else { + if (!isBatchReferenced(l, entry->batch)) { + raft_free(entry->batch); + } + } +} + +/* Core logic of @logTruncate and @logDiscard, removing all log entries from + * @index onward. If @destroy is true, also destroy the removed entries. */ +static void removeSuffix(struct raft_log *l, + const raft_index index, + bool destroy) +{ + size_t i; + size_t n; + raft_index start = index; + + assert(l != NULL); + assert(index > l->offset); + assert(index <= logLastIndex(l)); + + /* Number of entries to delete */ + n = (size_t)(logLastIndex(l) - start) + 1; + + for (i = 0; i < n; i++) { + struct raft_entry *entry; + bool unref; + + if (l->back == 0) { + l->back = l->size - 1; + } else { + l->back--; + } + + entry = &l->entries[l->back]; + unref = refsDecr(l, entry->term, start + n - i - 1); + + if (unref && destroy) { + destroyEntry(l, entry); + } + } + + clearIfEmpty(l); +} + +void logTruncate(struct raft_log *l, const raft_index index) +{ + if (logNumEntries(l) == 0) { + return; + } + removeSuffix(l, index, true); +} + +void logDiscard(struct raft_log *l, const raft_index index) +{ + removeSuffix(l, index, false); +} + +/* Delete all entries up to the given index (included). 
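+ * For example, if the log holds indices 5..10, calling removePrefix() with
+ * index 7 deletes entries 5, 6 and 7, leaves 8..10 and advances l->offset
+ * by 3.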
*/ +static void removePrefix(struct raft_log *l, const raft_index index) +{ + size_t i; + size_t n; + + assert(l != NULL); + assert(index > 0); + assert(index <= logLastIndex(l)); + + /* Number of entries to delete */ + n = (size_t)(index - indexAt(l, 0)) + 1; + + for (i = 0; i < n; i++) { + struct raft_entry *entry; + bool unref; + + entry = &l->entries[l->front]; + + if (l->front == l->size - 1) { + l->front = 0; + } else { + l->front++; + } + l->offset++; + + unref = refsDecr(l, entry->term, l->offset); + + if (unref) { + destroyEntry(l, entry); + } + } + + clearIfEmpty(l); +} + +void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing) +{ + raft_term last_term = logTermOf(l, last_index); + + /* We must have an entry at this index */ + assert(last_term != 0); + + l->snapshot.last_index = last_index; + l->snapshot.last_term = last_term; + + /* If we have not at least n entries preceeding the given last index, + * then there's nothing to remove and we're done. */ + if (last_index <= trailing || + locateEntry(l, last_index - trailing) == l->size) { + return; + } + + removePrefix(l, last_index - trailing); +} + +void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term) +{ + size_t n = logNumEntries(l); + assert(last_index > 0); + assert(last_term > 0); + if (n > 0) { + logTruncate(l, logLastIndex(l) - n + 1); + } + l->snapshot.last_index = last_index; + l->snapshot.last_term = last_term; + l->offset = last_index; +} diff --git a/src/raft/log.h b/src/raft/log.h new file mode 100644 index 000000000..46707d2f5 --- /dev/null +++ b/src/raft/log.h @@ -0,0 +1,169 @@ +/* In-memory cache of the persistent raft log stored on disk. */ + +#ifndef RAFT_LOG_H_ +#define RAFT_LOG_H_ + +#include "../raft.h" + +/* Initial size of the entry reference count hash table. */ +#define LOG__REFS_INITIAL_SIZE 256 + +/** + * Counter for outstanding references to a log entry. + * + * When an entry is first appended to the log, its refcount is set to one (the + * log itself is the only one referencing the entry). Whenever an entry is + * included in an I/O request (to write it to disk or to send it to other + * servers) its refcount is increased by one. Whenever an entry gets deleted + * from the log its refcount is decreased by one. Likewise, whenever an I/O + * request is completed the refcount of the relevant entries is decreased by + * one. When the refcount drops to zero the memory that its @buf attribute + * points to gets released, or, if the @batch attribute is non-NULL, a check is + * made to see if all other entries of the same batch also have a zero refcount, + * and the memory that @batch points to gets released if that's the case. + */ +struct raft_entry_ref +{ + raft_term term; /* Term of the entry being ref-counted. */ + raft_index index; /* Index of the entry being ref-counted. */ + unsigned short count; /* Number of references. */ + /* The next two fields are copied from the corresponding fields of the + * raft_entry pointed to by this reference. We store them here as well, + * so that logReinstate can retrieve them when it finds a raft_entry_ref + * with the same index and term as it was passed, and create a full + * raft_entry using them. */ + struct raft_buffer buf; + void *batch; + struct raft_entry_ref + *next; /* Next item in the bucket (for collisions). */ +}; + +/** + * In-memory cache of the persistent raft log stored on disk. 
+ *
+ * The raft log cache is implemented as a circular buffer of log entries, which
+ * makes some frequent operations very efficient (e.g. deleting the first N
+ * entries when snapshotting).
+ */
+struct raft_log
+{
+	struct raft_entry *entries; /* Circular buffer of log entries. */
+	size_t size;        /* Number of available slots in the buffer. */
+	size_t front, back; /* Indexes of used slots [front, back). */
+	raft_index offset;  /* Index of first entry is offset+1. */
+	struct raft_entry_ref
+	    *refs;        /* Log entries reference counts hash table. */
+	size_t refs_size; /* Size of the reference counts hash table. */
+	struct /* Information about last snapshot, or zero. */
+	{
+		raft_index
+		    last_index; /* Snapshot replaces all entries up to here. */
+		raft_term last_term; /* Term of last index. */
+	} snapshot;
+};
+
+/* Initialize an empty in-memory log of raft entries. */
+struct raft_log *logInit(void);
+
+/* Release all memory used by the given log object. */
+void logClose(struct raft_log *l);
+
+/* Called at startup when populating the log with entries loaded from disk. It
+ * sets the starting state of the log. The start index must be lower than or
+ * equal to snapshot_index + 1. */
+void logStart(struct raft_log *l,
+	      raft_index snapshot_index,
+	      raft_term snapshot_term,
+	      raft_index start_index);
+
+/* Get the number of entries the log currently contains. */
+size_t logNumEntries(struct raft_log *l);
+
+/* Get the index of the last entry in the log. Return #0 if the log is empty. */
+raft_index logLastIndex(struct raft_log *l);
+
+/* Get the term of the last entry in the log. Return #0 if the log is empty. */
+raft_term logLastTerm(struct raft_log *l);
+
+/* Get the term of the entry with the given index. Return #0 if @index is
+ * greater than the last index of the log, or if it's lower than the oldest
+ * index we know the term of (either because it's outstanding or because it's
+ * the last entry in the most recent snapshot). */
+raft_term logTermOf(struct raft_log *l, raft_index index);
+
+/* Get the index of the last entry in the most recent snapshot. Return #0 if
+ * there are no snapshots. */
+raft_index logSnapshotIndex(struct raft_log *l);
+
+/* Get the entry with the given index. The returned pointer remains valid only
+ * as long as no API that might delete the entry with the given index is
+ * invoked. Return #NULL if there is no such entry. */
+const struct raft_entry *logGet(struct raft_log *l, const raft_index index);
+
+/* Check whether the hash map is already tracking an entry with the given
+ * @term and @index (that is not part of the "logical" log). If so, increment
+ * the refcount of that entry and set @reinstated to true; otherwise, set
+ * @reinstated to false. */
+int logReinstate(struct raft_log *l,
+		 raft_term term,
+		 unsigned short type,
+		 bool *reinstated);
+
+/* Append a new entry to the log. */
+int logAppend(struct raft_log *l,
+	      raft_term term,
+	      unsigned short type,
+	      const struct raft_buffer *buf,
+	      void *batch);
+
+/* Convenience to append a series of #RAFT_COMMAND entries. */
+int logAppendCommands(struct raft_log *l,
+		      const raft_term term,
+		      const struct raft_buffer bufs[],
+		      const unsigned n);
+
+/* Convenience to encode and append a single #RAFT_CHANGE entry. */
+int logAppendConfiguration(struct raft_log *l,
+			   const raft_term term,
+			   const struct raft_configuration *configuration);
+
+/* Acquire an array of entries from the given index onwards.
The payload + * memory referenced by the @buf attribute of the returned entries is guaranteed + * to be valid until logRelease() is called. */ +int logAcquire(struct raft_log *l, + raft_index index, + struct raft_entry *entries[], + unsigned *n); + +/* Release a previously acquired array of entries. */ +void logRelease(struct raft_log *l, + raft_index index, + struct raft_entry entries[], + unsigned n); + +/* Delete all entries from the given index (included) onwards. If the log is + * empty this is a no-op. If @index is lower than or equal to the index of the + * first entry in the log, then the log will become empty. */ +void logTruncate(struct raft_log *l, const raft_index index); + +/* Discard all entries from the given index (included) onwards. This is exactly + * the same as truncate, but the memory of the entries does not gets + * released. This is called as part of error handling, when reverting the effect + * of previous logAppend calls. */ +void logDiscard(struct raft_log *l, const raft_index index); + +/* To be called when taking a new snapshot. The log must contain an entry at + * last_index, which is the index of the last entry included in the + * snapshot. The function will update the last snapshot information and delete + * all entries up to last_index - trailing (included). If the log contains no + * entry at last_index - trailing, then no entry will be deleted. */ +void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing); + +/* To be called when installing a snapshot. + * + * The log can be in any state. All outstanding entries will be discarded, the + * last index and last term of the most recent snapshot will be set to the given + * values, and the offset adjusted accordingly. */ +void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term); + +#endif /* RAFT_LOG_H_ */ diff --git a/src/raft/membership.c b/src/raft/membership.c new file mode 100644 index 000000000..f810c7722 --- /dev/null +++ b/src/raft/membership.c @@ -0,0 +1,279 @@ +#include "membership.h" + +#include "../raft.h" +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "err.h" +#include "heap.h" +#include "log.h" +#include "progress.h" + +int membershipCanChangeConfiguration(struct raft *r) +{ + int rv; + + if (r->state != RAFT_LEADER || r->transfer != NULL) { + tracef("NOT LEADER"); + rv = RAFT_NOTLEADER; + goto err; + } + + if (r->configuration_uncommitted_index != 0) { + tracef("r->configuration_uncommitted_index %llu", + r->configuration_uncommitted_index); + rv = RAFT_CANTCHANGE; + goto err; + } + + if (r->leader_state.promotee_id != 0) { + tracef("r->leader_state.promotee_id %llu", + r->leader_state.promotee_id); + rv = RAFT_CANTCHANGE; + goto err; + } + + /* In order to become leader at all we are supposed to have committed at + * least the initial configuration at index 1. */ + assert(r->configuration_committed_index > 0); + + /* The index of the last committed configuration can't be greater than + * the last log index. */ + assert(logLastIndex(r->log) >= r->configuration_committed_index); + + /* No catch-up round should be in progress. 
*/ + assert(r->leader_state.round_number == 0); + assert(r->leader_state.round_index == 0); + assert(r->leader_state.round_start == 0); + + return 0; + +err: + assert(rv != 0); + ErrMsgFromCode(r->errmsg, rv); + return rv; +} + +int membershipFetchLastCommittedConfiguration(struct raft *r, + struct raft_configuration *conf) +{ + const struct raft_entry *entry; + int rv; + + /* Try to get the entry at r->configuration_committed_index from the + * log. If the entry is not present in the log anymore because the log + * was truncated after a snapshot, we can just use + * configuration_last_snapshot, which we cached when we took or restored + * the snapshot and is guaranteed to match the content that the entry at + * r->configuration_committed_index had. */ + entry = logGet(r->log, r->configuration_committed_index); + if (entry != NULL) { + rv = configurationDecode(&entry->buf, conf); + } else { + assert(r->configuration_last_snapshot.n > 0); + rv = configurationCopy(&r->configuration_last_snapshot, conf); + } + if (rv != 0) { + return rv; + } + + return 0; +} + +bool membershipUpdateCatchUpRound(struct raft *r) +{ + unsigned server_index; + raft_index match_index; + raft_index last_index; + raft_time now = r->io->time(r->io); + raft_time round_duration; + bool is_up_to_date; + bool is_fast_enough; + + assert(r->state == RAFT_LEADER); + assert(r->leader_state.promotee_id != 0); + + server_index = configurationIndexOf(&r->configuration, + r->leader_state.promotee_id); + assert(server_index < r->configuration.n); + + match_index = progressMatchIndex(r, server_index); + + /* If the server did not reach the target index for this round, it did + * not catch up. */ + if (match_index < r->leader_state.round_index) { + tracef( + "member (index: %u) not yet caught up match_index:%llu " + "round_index:%llu", + server_index, match_index, r->leader_state.round_index); + return false; + } + + last_index = logLastIndex(r->log); + round_duration = now - r->leader_state.round_start; + + is_up_to_date = match_index == last_index; + is_fast_enough = round_duration < r->election_timeout; + + tracef("member is_up_to_date:%d is_fast_enough:%d", is_up_to_date, + is_fast_enough); + + /* If the server's log is fully up-to-date or the round that just + * terminated was fast enough, then the server as caught up. */ + if (is_up_to_date || is_fast_enough) { + r->leader_state.round_number = 0; + r->leader_state.round_index = 0; + r->leader_state.round_start = 0; + + return true; + } + + /* If we get here it means that this catch-up round is complete, but + * there are more entries to replicate, or it was not fast enough. Let's + * start a new round. 
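+ * For example, with the default election_timeout of 1000 msecs, a round
+ * that completed in 800 msecs counts as fast enough and the promotion
+ * succeeds even if new entries arrived meanwhile; a 1500 msecs round with
+ * outstanding entries starts another round targeting the current last
+ * index.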
*/ + r->leader_state.round_number++; + r->leader_state.round_index = last_index; + r->leader_state.round_start = now; + + return false; +} + +int membershipUncommittedChange(struct raft *r, + const raft_index index, + const struct raft_entry *entry) +{ + struct raft_configuration configuration; + int rv; + char msg[128]; + + assert(r != NULL); + assert(r->state == RAFT_FOLLOWER); + assert(entry != NULL); + assert(entry->type == RAFT_CHANGE); + + rv = configurationDecode(&entry->buf, &configuration); + if (rv != 0) { + tracef("failed to decode configuration at index:%llu", index); + goto err; + } + + /* ignore errors */ + snprintf(msg, sizeof(msg), "uncommitted config change at index:%llu", + index); + configurationTrace(r, &configuration, msg); + + raft_configuration_close(&r->configuration); + + r->configuration = configuration; + r->configuration_uncommitted_index = index; + + return 0; + +err: + assert(rv != 0); + return rv; +} + +int membershipRollback(struct raft *r) +{ + int rv; + + assert(r != NULL); + assert(r->state == RAFT_FOLLOWER); + assert(r->configuration_uncommitted_index > 0); + tracef("roll back membership"); + + /* Fetch the last committed configuration entry. */ + assert(r->configuration_committed_index != 0); + + /* Replace the current configuration with the last committed one. */ + configurationClose(&r->configuration); + rv = membershipFetchLastCommittedConfiguration(r, &r->configuration); + if (rv != 0) { + return rv; + } + + configurationTrace(r, &r->configuration, "roll back config"); + r->configuration_uncommitted_index = 0; + return 0; +} + +void membershipLeadershipTransferInit(struct raft *r, + struct raft_transfer *req, + raft_id id, + raft_transfer_cb cb) +{ + req->cb = cb; + req->id = id; + req->start = r->io->time(r->io); + req->send.data = NULL; + r->transfer = req; +} + +static void membershipLeadershipSendCb(struct raft_io_send *send, int status) +{ + (void)status; + RaftHeapFree(send); +} + +int membershipLeadershipTransferStart(struct raft *r) +{ + const struct raft_server *server; + struct raft_message message; + struct raft_io_send *send; + int rv; + assert(r->transfer->send.data == NULL); + server = configurationGet(&r->configuration, r->transfer->id); + assert(server != NULL); + if (server == NULL) { + tracef("transferee server not found in configuration"); + return -1; + } + + /* Don't use the raft_io_send object embedded in struct raft_transfer, + * since the two objects must have different lifetimes. For example + * raft_io_send might live longer than raft_transfer, see #396. + * + * Ideally we should remove the embedded struct raft_io_send send field + * from struct raft_transfer, and replace it with a raft_io_send *send + * pointer, that we set to the raft_io_send object allocated in this + * function. This would break ABI compatibility though. */ + send = RaftHeapMalloc(sizeof *send); + if (send == NULL) { + return RAFT_NOMEM; + } + + message.type = RAFT_IO_TIMEOUT_NOW; + message.server_id = server->id; + message.server_address = server->address; + message.timeout_now.term = r->current_term; + message.timeout_now.last_log_index = logLastIndex(r->log); + message.timeout_now.last_log_term = logLastTerm(r->log); + + /* Set the data attribute of the raft_io_send object embedded in + * raft_transfer. This is needed because we historically used it as a + * flag to indicate that a transfer request was sent. See the + * replicationUpdate function. 
*/ + r->transfer->send.data = r; + + send->data = r; + + rv = r->io->send(r->io, send, &message, membershipLeadershipSendCb); + if (rv != 0) { + RaftHeapFree(send); + ErrMsgTransferf(r->io->errmsg, r->errmsg, + "send timeout now to %llu", server->id); + return rv; + } + return 0; +} + +void membershipLeadershipTransferClose(struct raft *r) +{ + struct raft_transfer *req = r->transfer; + raft_transfer_cb cb = req->cb; + r->transfer = NULL; + if (cb != NULL) { + cb(req); + } +} diff --git a/src/raft/membership.h b/src/raft/membership.h new file mode 100644 index 000000000..15769c9cd --- /dev/null +++ b/src/raft/membership.h @@ -0,0 +1,59 @@ +/* Membership-related APIs. */ + +#ifndef MEMBERSHIP_H_ +#define MEMBERSHIP_H_ + +#include "../raft.h" + +/* Helper returning an error if the configuration can't be changed, either + * because this node is not the leader or because a configuration change is + * already in progress. */ +int membershipCanChangeConfiguration(struct raft *r); + +/* Populate the given configuration object with the most recent committed + * configuration, the one contained in the entry at + * r->configuration_committed_index. */ +int membershipFetchLastCommittedConfiguration(struct raft *r, + struct raft_configuration *conf); + +/* Update the information about the progress that the non-voting server + * currently being promoted is making in catching with logs. + * + * Return false if the server being promoted did not yet catch-up with logs, and + * true if it did. + * + * This function must be called only by leaders after a @raft_assign request + * has been submitted. */ +bool membershipUpdateCatchUpRound(struct raft *r); + +/* Update the local configuration replacing it with the content of the given + * RAFT_CHANGE entry, which has just been received in as part of an + * AppendEntries RPC request. The uncommitted configuration index will be + * updated accordingly. + * + * It must be called only by followers. */ +int membershipUncommittedChange(struct raft *r, + const raft_index index, + const struct raft_entry *entry); + +/* Rollback any promotion configuration change that was applied locally, but + * failed to be committed. It must be called by followers after they receive an + * AppendEntries RPC request that instructs them to evict the uncommitted entry + * from their log. */ +int membershipRollback(struct raft *r); + +/* Initialize the state of a leadership transfer request. */ +void membershipLeadershipTransferInit(struct raft *r, + struct raft_transfer *req, + raft_id id, + raft_transfer_cb cb); + +/* Start the leadership transfer by sending a TimeoutNow message to the target + * server. */ +int membershipLeadershipTransferStart(struct raft *r); + +/* Finish a leadership transfer (whether successful or not), resetting the + * leadership transfer state and firing the user callback. */ +void membershipLeadershipTransferClose(struct raft *r); + +#endif /* MEMBERSHIP_H_ */ diff --git a/src/raft/progress.c b/src/raft/progress.c new file mode 100644 index 000000000..696134c70 --- /dev/null +++ b/src/raft/progress.c @@ -0,0 +1,325 @@ +#include "progress.h" + +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "log.h" + +#ifndef max +#define max(a, b) ((a) < (b) ? (b) : (a)) +#endif + +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + +/* Initialize a single progress object. 
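+ * A fresh peer is assumed to have replicated nothing (match_index = 0) and
+ * to need entries starting right after the leader's last log index
+ * (next_index = last_index + 1); it starts out in PROBE state until
+ * AppendEntries results tell us more.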
*/ +static void initProgress(struct raft_progress *p, raft_index last_index) +{ + p->next_index = last_index + 1; + p->match_index = 0; + p->snapshot_index = 0; + p->last_send = 0; + p->snapshot_last_send = 0; + p->recent_recv = false; + p->state = PROGRESS__PROBE; + p->features = 0; +} + +int progressBuildArray(struct raft *r) +{ + struct raft_progress *progress; + unsigned i; + raft_index last_index = logLastIndex(r->log); + progress = raft_malloc(r->configuration.n * sizeof *progress); + if (progress == NULL) { + return RAFT_NOMEM; + } + for (i = 0; i < r->configuration.n; i++) { + initProgress(&progress[i], last_index); + if (r->configuration.servers[i].id == r->id) { + progress[i].match_index = r->last_stored; + } + } + r->leader_state.progress = progress; + return 0; +} + +int progressRebuildArray(struct raft *r, + const struct raft_configuration *configuration) +{ + raft_index last_index = logLastIndex(r->log); + struct raft_progress *progress; + unsigned i; + unsigned j; + raft_id id; + + progress = raft_malloc(configuration->n * sizeof *progress); + if (progress == NULL) { + return RAFT_NOMEM; + } + + /* First copy the progress information for the servers that exists both + * in the current and in the new configuration. */ + for (i = 0; i < r->configuration.n; i++) { + id = r->configuration.servers[i].id; + j = configurationIndexOf(configuration, id); + if (j == configuration->n) { + /* This server is not present in the new configuration, + * so we just skip it. */ + continue; + } + progress[j] = r->leader_state.progress[i]; + } + + /* Then reset the replication state for servers that are present in the + * new configuration, but not in the current one. */ + for (i = 0; i < configuration->n; i++) { + id = configuration->servers[i].id; + j = configurationIndexOf(&r->configuration, id); + if (j < r->configuration.n) { + /* This server is present both in the new and in the + * current configuration, so we have already copied its + * next/match index value in the loop above. */ + continue; + } + assert(j == r->configuration.n); + initProgress(&progress[i], last_index); + } + + raft_free(r->leader_state.progress); + r->leader_state.progress = progress; + + return 0; +} + +bool progressIsUpToDate(struct raft *r, unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + raft_index last_index = logLastIndex(r->log); + return p->next_index == last_index + 1; +} + +bool progressPersistedIsUpToDate(struct raft *r, unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + raft_index last_index = logLastIndex(r->log); + return p->match_index == last_index; +} + +bool progressShouldReplicate(struct raft *r, unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + raft_time now = r->io->time(r->io); + bool needs_heartbeat = now - p->last_send >= r->heartbeat_timeout; + raft_index last_index = logLastIndex(r->log); + bool result = false; + + /* We must be in a valid state. */ + assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE || + p->state == PROGRESS__SNAPSHOT); + + /* The next index to send must be lower than the highest index in our + * log. 
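+	 * (It may be at most one past it, when the follower is fully caught
+	 * up.)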
*/ + assert(p->next_index <= last_index + 1); + + switch (p->state) { + case PROGRESS__SNAPSHOT: + /* Snapshot timed out, move to PROBE */ + if (now - p->snapshot_last_send >= + r->install_snapshot_timeout) { + tracef("snapshot timed out for index:%u", i); + result = true; + progressAbortSnapshot(r, i); + } else { + /* Enforce Leadership during follower Snapshot + * installation */ + result = needs_heartbeat; + } + break; + case PROGRESS__PROBE: + /* We send at most one message per heartbeat interval. + */ + result = needs_heartbeat; + break; + case PROGRESS__PIPELINE: + /* In replication mode we send empty append entries + * messages only if haven't sent anything in the last + * heartbeat interval. */ + result = !progressIsUpToDate(r, i) || needs_heartbeat; + break; + } + return result; +} + +raft_index progressNextIndex(struct raft *r, unsigned i) +{ + return r->leader_state.progress[i].next_index; +} + +raft_index progressMatchIndex(struct raft *r, unsigned i) +{ + return r->leader_state.progress[i].match_index; +} + +void progressUpdateLastSend(struct raft *r, unsigned i) +{ + r->leader_state.progress[i].last_send = r->io->time(r->io); +} + +void progressUpdateSnapshotLastSend(struct raft *r, unsigned i) +{ + r->leader_state.progress[i].snapshot_last_send = r->io->time(r->io); +} + +bool progressResetRecentRecv(struct raft *r, const unsigned i) +{ + bool prev = r->leader_state.progress[i].recent_recv; + r->leader_state.progress[i].recent_recv = false; + return prev; +} + +void progressMarkRecentRecv(struct raft *r, const unsigned i) +{ + r->leader_state.progress[i].recent_recv = true; +} + +inline void progressSetFeatures(struct raft *r, + const unsigned i, + raft_flags features) +{ + r->leader_state.progress[i].features = features; +} + +inline raft_flags progressGetFeatures(struct raft *r, const unsigned i) +{ + return r->leader_state.progress[i].features; +} + +bool progressGetRecentRecv(const struct raft *r, const unsigned i) +{ + return r->leader_state.progress[i].recent_recv; +} + +void progressToSnapshot(struct raft *r, unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + p->state = PROGRESS__SNAPSHOT; + p->snapshot_index = logSnapshotIndex(r->log); +} + +void progressAbortSnapshot(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + p->snapshot_index = 0; + p->state = PROGRESS__PROBE; +} + +int progressState(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + return p->state; +} + +bool progressMaybeDecrement(struct raft *r, + const unsigned i, + raft_index rejected, + raft_index last_index) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + + assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE || + p->state == PROGRESS__SNAPSHOT); + + if (p->state == PROGRESS__SNAPSHOT) { + /* The rejection must be stale or spurious if the rejected index + * does not match the last snapshot index. */ + if (rejected != p->snapshot_index) { + return false; + } + progressAbortSnapshot(r, i); + return true; + } + + if (p->state == PROGRESS__PIPELINE) { + /* The rejection must be stale if the rejected index is smaller + * than the matched one. 
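+		 * For example, if match_index is already 7, a rejection
+		 * carrying index 5 must refer to an old request and can be
+		 * ignored.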
*/ + if (rejected <= p->match_index) { + tracef("match index is up to date -> ignore "); + return false; + } + /* Directly decrease next to match + 1 */ + p->next_index = min(rejected, p->match_index + 1); + progressToProbe(r, i); + return true; + } + + /* The rejection must be stale or spurious if the rejected index does + * not match the next index minus one. */ + if (rejected != p->next_index - 1) { + tracef( + "rejected index %llu different from next index %lld -> " + "ignore ", + rejected, p->next_index); + return false; + } + + p->next_index = min(rejected, last_index + 1); + p->next_index = max(p->next_index, 1); + + return true; +} + +void progressOptimisticNextIndex(struct raft *r, + unsigned i, + raft_index next_index) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + p->next_index = next_index; +} + +bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + bool updated = false; + if (p->match_index < last_index) { + p->match_index = last_index; + updated = true; + } + if (p->next_index < last_index + 1) { + p->next_index = last_index + 1; + } + return updated; +} + +void progressToProbe(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + + /* If the current state is snapshot, we know that the pending snapshot + * has been sent to this peer successfully, so we probe from + * snapshot_index + 1.*/ + if (p->state == PROGRESS__SNAPSHOT) { + assert(p->snapshot_index > 0); + p->next_index = max(p->match_index + 1, p->snapshot_index); + p->snapshot_index = 0; + } else { + p->next_index = p->match_index + 1; + } + p->state = PROGRESS__PROBE; +} + +void progressToPipeline(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + p->state = PROGRESS__PIPELINE; +} + +bool progressSnapshotDone(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + assert(p->state == PROGRESS__SNAPSHOT); + return p->match_index >= p->snapshot_index; +} + +#undef tracef diff --git a/src/raft/progress.h b/src/raft/progress.h new file mode 100644 index 000000000..b1de1a0e6 --- /dev/null +++ b/src/raft/progress.h @@ -0,0 +1,139 @@ +/* Track replication progress on followers. */ + +#ifndef PROGRESS_H_ +#define PROGRESS_H_ + +#include "../raft.h" + +/* Possible values for the state field of struct raft_progress. */ +enum { + PROGRESS__PROBE = + 0, /* At most one AppendEntries per heartbeat interval */ + PROGRESS__PIPELINE, /* Optimistically stream AppendEntries */ + PROGRESS__SNAPSHOT /* Sending a snapshot */ +}; + +/** + * Used by leaders to keep track of replication progress for each server. + */ +struct raft_progress +{ + unsigned short state; /* Probe, pipeline or snapshot. */ + raft_index next_index; /* Next entry to send. */ + raft_index match_index; /* Highest index reported as replicated. */ + raft_index + snapshot_index; /* Last index of most recent snapshot sent. */ + raft_time last_send; /* Timestamp of last AppendEntries RPC. */ + raft_time + snapshot_last_send; /* Timestamp of last InstallSnaphot RPC. */ + bool recent_recv; /* A msg was received within election timeout. */ + raft_flags features; /* What the server is capable of. */ +}; + +/* Create and initialize the array of progress objects used by the leader to * + * track followers. The match index will be set to zero, and the next index to + * the current last index plus 1. 
*/ +int progressBuildArray(struct raft *r); + +/* Re-build the progress array against a new configuration. + * + * Progress information for servers existing both in the new and in the current + * configuration will remain unchanged. + * + * Progress information for servers existing only in the new configuration will + * be initialized as in progressBuildArray().*/ +int progressRebuildArray(struct raft *r, + const struct raft_configuration *configuration); + +/* Whether the i'th server in the configuration has been sent all the log + * entries. */ +bool progressIsUpToDate(struct raft *r, unsigned i); + +/* Whether the persisted log of the i'th server in the configuration up-to-date + * with ours. */ +bool progressPersistedIsUpToDate(struct raft *r, unsigned i); + +/* Whether a new AppendEntries or InstallSnapshot message should be sent to the + * i'th server at this time. + * + * See the docstring of replicationProgress() for details about how the decision + * is taken. */ +bool progressShouldReplicate(struct raft *r, unsigned i); + +/* Return the index of the next entry that should be sent to the i'th server. */ +raft_index progressNextIndex(struct raft *r, unsigned i); + +/* Return the index of the most recent entry that the i'th server has reported + * as replicated. */ +raft_index progressMatchIndex(struct raft *r, unsigned i); + +/* Update the last_send timestamp after an AppendEntries request has been + * sent. */ +void progressUpdateLastSend(struct raft *r, unsigned i); + +/* Update the snapshot_last_send timestamp after an InstallSnaphot request has + * been sent. */ +void progressUpdateSnapshotLastSend(struct raft *r, unsigned i); + +/* Reset to false the recent_recv flag of the server at the given index, + * returning the previous value. + * + * To be called once every election_timeout milliseconds. */ +bool progressResetRecentRecv(struct raft *r, unsigned i); + +/* Set to true the recent_recv flag of the server at the given index. + * + * To be called whenever we receive an AppendEntries RPC result */ +void progressMarkRecentRecv(struct raft *r, unsigned i); + +/* Return the value of the recent_recv flag. */ +bool progressGetRecentRecv(const struct raft *r, unsigned i); + +/* Convert to the i'th server to snapshot mode. */ +void progressToSnapshot(struct raft *r, unsigned i); + +/* Convert to probe mode. */ +void progressToProbe(struct raft *r, unsigned i); + +/* Convert to pipeline mode. */ +void progressToPipeline(struct raft *r, unsigned i); + +/* Abort snapshot mode and switch to back to probe. + * + * Called after sending the snapshot has failed or timed out. */ +void progressAbortSnapshot(struct raft *r, unsigned i); + +/* Return the progress mode code for the i'th server. */ +int progressState(struct raft *r, unsigned i); + +/* Optimistically update the next index of the given server. + * + * Called in pipeline mode after sending new entries. */ +void progressOptimisticNextIndex(struct raft *r, + unsigned i, + raft_index next_index); + +/* Return false if the given @index comes from an outdated message. Otherwise + * update the progress and returns true. To be called when receiving a + * successful AppendEntries RPC response. */ +bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index); + +/* Return false if the given rejected index comes from an out of order + * message. Otherwise decrease the progress next index to min(rejected, + * last_index) and returns true. To be called when receiving an unsuccessful + * AppendEntries RPC response. 
*/ +bool progressMaybeDecrement(struct raft *r, + unsigned i, + raft_index rejected, + raft_index last_index); + +/* Return true if match_index is equal or higher than the snapshot_index. */ +bool progressSnapshotDone(struct raft *r, unsigned i); + +/* Sets the feature flags of a node. */ +void progressSetFeatures(struct raft *r, const unsigned i, raft_flags features); + +/* Gets the feature flags of a node. */ +raft_flags progressGetFeatures(struct raft *r, const unsigned i); + +#endif /* PROGRESS_H_ */ diff --git a/src/raft/queue.h b/src/raft/queue.h new file mode 100644 index 000000000..1262cf554 --- /dev/null +++ b/src/raft/queue.h @@ -0,0 +1,57 @@ +#ifndef QUEUE_H_ +#define QUEUE_H_ + +#include + +typedef void *queue[2]; + +/* Private macros. */ +#define QUEUE_NEXT(q) (*(queue **)&((*(q))[0])) +#define QUEUE_PREV(q) (*(queue **)&((*(q))[1])) + +#define QUEUE_PREV_NEXT(q) (QUEUE_NEXT(QUEUE_PREV(q))) +#define QUEUE_NEXT_PREV(q) (QUEUE_PREV(QUEUE_NEXT(q))) + +/* Initialize an empty queue. */ +#define QUEUE_INIT(q) \ + { \ + QUEUE_NEXT(q) = (q); \ + QUEUE_PREV(q) = (q); \ + } + +/* Return true if the queue has no element. */ +#define QUEUE_IS_EMPTY(q) ((const queue *)(q) == (const queue *)QUEUE_NEXT(q)) + +/* Insert an element at the back of a queue. */ +#define QUEUE_PUSH(q, e) \ + { \ + QUEUE_NEXT(e) = (q); \ + QUEUE_PREV(e) = QUEUE_PREV(q); \ + QUEUE_PREV_NEXT(e) = (e); \ + QUEUE_PREV(q) = (e); \ + } + +/* Remove the given element from the queue. Any element can be removed at any * + * time. */ +#define QUEUE_REMOVE(e) \ + { \ + QUEUE_PREV_NEXT(e) = QUEUE_NEXT(e); \ + QUEUE_NEXT_PREV(e) = QUEUE_PREV(e); \ + } + +/* Return the element at the front of the queue. */ +#define QUEUE_HEAD(q) (QUEUE_NEXT(q)) + +/* Return the element at the back of the queue. */ +#define QUEUE_TAIL(q) (QUEUE_PREV(q)) + +/* Iterate over the element of a queue. * Mutating the queue while iterating + * results in undefined behavior. */ +#define QUEUE_FOREACH(q, e) \ + for ((q) = QUEUE_NEXT(e); (q) != (e); (q) = QUEUE_NEXT(q)) + +/* Return the structure holding the given element. */ +#define QUEUE_DATA(e, type, field) \ + ((type *)((void *)((char *)(e)-offsetof(type, field)))) + +#endif /* QUEUE_H_*/ diff --git a/src/raft/raft.c b/src/raft/raft.c new file mode 100644 index 000000000..e1ff0c41b --- /dev/null +++ b/src/raft/raft.c @@ -0,0 +1,304 @@ +#include "../raft.h" + +#include + +#include "../tracing.h" +#include "assert.h" +#include "byte.h" +#include "callbacks.h" +#include "configuration.h" +#include "convert.h" +#include "election.h" +#include "err.h" +#include "flags.h" +#include "heap.h" +#include "log.h" +#include "membership.h" + +#define DEFAULT_ELECTION_TIMEOUT 1000 /* One second */ +#define DEFAULT_HEARTBEAT_TIMEOUT 100 /* One tenth of a second */ +#define DEFAULT_INSTALL_SNAPSHOT_TIMEOUT 30000 /* 30 seconds */ +#define DEFAULT_SNAPSHOT_THRESHOLD 1024 +#define DEFAULT_SNAPSHOT_TRAILING 2048 + +/* Number of milliseconds after which a server promotion will be aborted if the + * server hasn't caught up with the logs yet. 
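+ *
+ * DEFAULT_MAX_CATCH_UP_ROUNDS caps how many catch-up rounds are attempted,
+ * while DEFAULT_MAX_CATCH_UP_ROUND_DURATION caps the duration of a single
+ * round, in milliseconds (5 * 1000 = 5 seconds).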
*/ +#define DEFAULT_MAX_CATCH_UP_ROUNDS 10 +#define DEFAULT_MAX_CATCH_UP_ROUND_DURATION (5 * 1000) + +int raft_version_number(void) +{ + return RAFT_VERSION_NUMBER; +} + +static int ioFsmVersionCheck(struct raft *r, + struct raft_io *io, + struct raft_fsm *fsm); + +int raft_init(struct raft *r, + struct raft_io *io, + struct raft_fsm *fsm, + const raft_id id, + const char *address) +{ + int rv; + assert(r != NULL); + + rv = ioFsmVersionCheck(r, io, fsm); + if (rv != 0) { + goto err; + } + + r->io = io; + r->io->data = r; + r->fsm = fsm; + + r->tracer = NULL; + + r->id = id; + /* Make a copy of the address */ + r->address = RaftHeapMalloc(strlen(address) + 1); + if (r->address == NULL) { + rv = RAFT_NOMEM; + goto err; + } + strcpy(r->address, address); + r->current_term = 0; + r->voted_for = 0; + r->log = logInit(); + if (r->log == NULL) { + rv = RAFT_NOMEM; + goto err_after_address_alloc; + } + + raft_configuration_init(&r->configuration); + raft_configuration_init(&r->configuration_last_snapshot); + r->configuration_committed_index = 0; + r->configuration_uncommitted_index = 0; + r->election_timeout = DEFAULT_ELECTION_TIMEOUT; + r->heartbeat_timeout = DEFAULT_HEARTBEAT_TIMEOUT; + r->install_snapshot_timeout = DEFAULT_INSTALL_SNAPSHOT_TIMEOUT; + r->commit_index = 0; + r->last_applied = 0; + r->last_stored = 0; + r->state = RAFT_UNAVAILABLE; + r->leader_state.voter_contacts = 0; + rv = raftInitCallbacks(r); + if (rv != 0) { + goto err_after_address_alloc; + } + r->transfer = NULL; + r->snapshot.pending.term = 0; + r->snapshot.threshold = DEFAULT_SNAPSHOT_THRESHOLD; + r->snapshot.trailing = DEFAULT_SNAPSHOT_TRAILING; + r->snapshot.put.data = NULL; + r->close_cb = NULL; + memset(r->errmsg, 0, sizeof r->errmsg); + r->pre_vote = false; + r->max_catch_up_rounds = DEFAULT_MAX_CATCH_UP_ROUNDS; + r->max_catch_up_round_duration = DEFAULT_MAX_CATCH_UP_ROUND_DURATION; + rv = r->io->init(r->io, r->id, r->address); + if (rv != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); + goto err_after_callbacks_alloc; + } + return 0; + +err_after_callbacks_alloc: + raftDestroyCallbacks(r); +err_after_address_alloc: + RaftHeapFree(r->address); +err: + assert(rv != 0); + return rv; +} + +static void ioCloseCb(struct raft_io *io) +{ + struct raft *r = io->data; + tracef("io close cb"); + raftDestroyCallbacks(r); + raft_free(r->address); + logClose(r->log); + raft_configuration_close(&r->configuration); + raft_configuration_close(&r->configuration_last_snapshot); + if (r->close_cb != NULL) { + r->close_cb(r); + } +} + +void raft_close(struct raft *r, void (*cb)(struct raft *r)) +{ + assert(r->close_cb == NULL); + if (r->state != RAFT_UNAVAILABLE) { + convertToUnavailable(r); + } + r->close_cb = cb; + r->io->close(r->io, ioCloseCb); +} + +void raft_register_state_cb(struct raft *r, raft_state_cb cb) +{ + struct raft_callbacks *cbs = raftGetCallbacks(r); + assert(cbs != NULL); + cbs->state_cb = cb; +} + +void raft_set_election_timeout(struct raft *r, const unsigned msecs) +{ + r->election_timeout = msecs; +} + +void raft_set_heartbeat_timeout(struct raft *r, const unsigned msecs) +{ + r->heartbeat_timeout = msecs; +} + +void raft_set_install_snapshot_timeout(struct raft *r, const unsigned msecs) +{ + r->install_snapshot_timeout = msecs; +} + +void raft_set_snapshot_threshold(struct raft *r, unsigned n) +{ + r->snapshot.threshold = n; +} + +void raft_set_snapshot_trailing(struct raft *r, unsigned n) +{ + r->snapshot.trailing = n; +} + +void raft_set_max_catch_up_rounds(struct raft *r, unsigned n) +{ + 
r->max_catch_up_rounds = n; +} + +void raft_set_max_catch_up_round_duration(struct raft *r, unsigned msecs) +{ + r->max_catch_up_round_duration = msecs; +} + +void raft_set_pre_vote(struct raft *r, bool enabled) +{ + r->pre_vote = enabled; +} + +const char *raft_errmsg(struct raft *r) +{ + return r->errmsg; +} + +int raft_voter_contacts(struct raft *r) +{ + int ret; + if (r->state == RAFT_LEADER) { + ret = (int)r->leader_state.voter_contacts; + } else { + ret = -1; + } + return ret; +} + +int raft_bootstrap(struct raft *r, const struct raft_configuration *conf) +{ + int rv; + + if (r->state != RAFT_UNAVAILABLE) { + return RAFT_BUSY; + } + + rv = r->io->bootstrap(r->io, conf); + if (rv != 0) { + return rv; + } + + return 0; +} + +int raft_recover(struct raft *r, const struct raft_configuration *conf) +{ + int rv; + + if (r->state != RAFT_UNAVAILABLE) { + return RAFT_BUSY; + } + + rv = r->io->recover(r->io, conf); + if (rv != 0) { + return rv; + } + + return 0; +} + +const char *raft_strerror(int errnum) +{ + return errCodeToString(errnum); +} + +void raft_configuration_init(struct raft_configuration *c) +{ + configurationInit(c); +} + +void raft_configuration_close(struct raft_configuration *c) +{ + configurationClose(c); +} + +int raft_configuration_add(struct raft_configuration *c, + const raft_id id, + const char *address, + const int role) +{ + return configurationAdd(c, id, address, role); +} + +int raft_configuration_encode(const struct raft_configuration *c, + struct raft_buffer *buf) +{ + return configurationEncode(c, buf); +} + +unsigned long long raft_digest(const char *text, unsigned long long n) +{ + struct byteSha1 sha1; + uint8_t value[20]; + uint64_t n64 = byteFlip64((uint64_t)n); + uint64_t digest; + + byteSha1Init(&sha1); + byteSha1Update(&sha1, (const uint8_t *)text, (uint32_t)strlen(text)); + byteSha1Update(&sha1, (const uint8_t *)&n64, (uint32_t)(sizeof n64)); + byteSha1Digest(&sha1, value); + + memcpy(&digest, value + (sizeof value - sizeof digest), sizeof digest); + + return byteFlip64(digest); +} + +static int ioFsmVersionCheck(struct raft *r, + struct raft_io *io, + struct raft_fsm *fsm) +{ + if (io->version == 0) { + ErrMsgPrintf(r->errmsg, "io->version must be set"); + return -1; + } + + if (fsm->version == 0) { + ErrMsgPrintf(r->errmsg, "fsm->version must be set"); + return -1; + } + + if ((fsm->version > 2 && fsm->snapshot_async != NULL) && + ((io->version < 2) || (io->async_work == NULL))) { + ErrMsgPrintf(r->errmsg, + "async snapshot requires io->version > 1 and " + "async_work method."); + return -1; + } + + return 0; +} diff --git a/src/raft/recv.c b/src/raft/recv.c new file mode 100644 index 000000000..5f0da1723 --- /dev/null +++ b/src/raft/recv.c @@ -0,0 +1,225 @@ +#include "recv.h" + +#include "../tracing.h" +#include "assert.h" +#include "convert.h" +#include "entry.h" +#include "heap.h" +#include "log.h" +#include "membership.h" +#include "recv_append_entries.h" +#include "recv_append_entries_result.h" +#include "recv_install_snapshot.h" +#include "recv_request_vote.h" +#include "recv_request_vote_result.h" +#include "recv_timeout_now.h" +#include "string.h" + +/* Dispatch a single RPC message to the appropriate handler. 
*/ +static int recvMessage(struct raft *r, struct raft_message *message) +{ + int rv = 0; + + switch (message->type) { + case RAFT_IO_APPEND_ENTRIES: + rv = recvAppendEntries(r, message->server_id, + message->server_address, + &message->append_entries); + if (rv != 0) { + entryBatchesDestroy( + message->append_entries.entries, + message->append_entries.n_entries); + } + break; + case RAFT_IO_APPEND_ENTRIES_RESULT: + rv = recvAppendEntriesResult( + r, message->server_id, message->server_address, + &message->append_entries_result); + break; + case RAFT_IO_REQUEST_VOTE: + rv = recvRequestVote(r, message->server_id, + message->server_address, + &message->request_vote); + break; + case RAFT_IO_REQUEST_VOTE_RESULT: + rv = recvRequestVoteResult( + r, message->server_id, message->server_address, + &message->request_vote_result); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + rv = recvInstallSnapshot(r, message->server_id, + message->server_address, + &message->install_snapshot); + /* Already installing a snapshot, wait for it and ignore + * this one */ + if (rv == RAFT_BUSY) { + raft_free(message->install_snapshot.data.base); + raft_configuration_close( + &message->install_snapshot.conf); + rv = 0; + } + break; + case RAFT_IO_TIMEOUT_NOW: + rv = recvTimeoutNow(r, message->server_id, + message->server_address, + &message->timeout_now); + break; + default: + tracef("received unknown message type (%d)", + message->type); + /* Drop message */ + return 0; + }; + + if (rv != 0 && rv != RAFT_NOCONNECTION) { + tracef("recv: %d: %s", message->type, raft_strerror(rv)); + return rv; + } + + /* If there's a leadership transfer in progress, check if it has + * completed. */ + if (r->transfer != NULL) { + if (r->follower_state.current_leader.id == r->transfer->id) { + membershipLeadershipTransferClose(r); + } + } + + return 0; +} + +void recvCb(struct raft_io *io, struct raft_message *message) +{ + struct raft *r = io->data; + int rv; + if (r->state == RAFT_UNAVAILABLE) { + switch (message->type) { + case RAFT_IO_APPEND_ENTRIES: + entryBatchesDestroy( + message->append_entries.entries, + message->append_entries.n_entries); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + raft_configuration_close( + &message->install_snapshot.conf); + raft_free(message->install_snapshot.data.base); + break; + } + return; + } + rv = recvMessage(r, message); + if (rv != 0) { + convertToUnavailable(r); + } +} + +int recvBumpCurrentTerm(struct raft *r, raft_term term) +{ + int rv; + char msg[128]; + + assert(r != NULL); + assert(term > r->current_term); + + sprintf(msg, "remote term %lld is higher than %lld -> bump local term", + term, r->current_term); + if (r->state != RAFT_FOLLOWER) { + strcat(msg, " and step down"); + } + tracef("%s", msg); + + /* Save the new term to persistent store, resetting the vote. */ + rv = r->io->set_term(r->io, term); + if (rv != 0) { + return rv; + } + + /* Update our cache too. */ + r->current_term = term; + r->voted_for = 0; + + if (r->state != RAFT_FOLLOWER) { + /* Also convert to follower. 
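+		 * From the state diagram in Figure 3.3: a candidate or
+		 * leader that discovers a higher term immediately reverts
+		 * to follower state.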
*/ + convertToFollower(r); + } + + return 0; +} + +void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match) +{ + if (term < r->current_term) { + *match = -1; + } else if (term > r->current_term) { + *match = 1; + } else { + *match = 0; + } +} + +int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match) +{ + int rv; + + assert(r != NULL); + assert(match != NULL); + + recvCheckMatchingTerms(r, term, match); + + if (*match == -1) { + tracef("old term - current_term:%llu other_term:%llu", + r->current_term, term); + return 0; + } + + /* From Figure 3.1: + * + * Rules for Servers: All Servers: If RPC request or response contains + * term T > currentTerm: set currentTerm = T, convert to follower. + * + * From state diagram in Figure 3.3: + * + * [leader]: discovers server with higher term -> [follower] + * + * From Section 3.3: + * + * If a candidate or leader discovers that its term is out of date, it + * immediately reverts to follower state. + */ + if (*match == 1) { + rv = recvBumpCurrentTerm(r, term); + if (rv != 0) { + tracef("recvBumpCurrentTerm failed %d", rv); + return rv; + } + } + + return 0; +} + +int recvUpdateLeader(struct raft *r, const raft_id id, const char *address) +{ + assert(r->state == RAFT_FOLLOWER); + + r->follower_state.current_leader.id = id; + + /* If the address of the current leader is the same as the given one, + * we're done. */ + if (r->follower_state.current_leader.address != NULL && + strcmp(address, r->follower_state.current_leader.address) == 0) { + return 0; + } + + if (r->follower_state.current_leader.address != NULL) { + RaftHeapFree(r->follower_state.current_leader.address); + } + r->follower_state.current_leader.address = + RaftHeapMalloc(strlen(address) + 1); + if (r->follower_state.current_leader.address == NULL) { + return RAFT_NOMEM; + } + strcpy(r->follower_state.current_leader.address, address); + + return 0; +} + +#undef tracef diff --git a/src/raft/recv.h b/src/raft/recv.h new file mode 100644 index 000000000..df1fece75 --- /dev/null +++ b/src/raft/recv.h @@ -0,0 +1,44 @@ +/* Receive an RPC message. */ + +#ifndef RECV_H_ +#define RECV_H_ + +#include "../raft.h" + +/* Callback to be passed to the raft_io implementation. It will be invoked upon + * receiving an RPC message. */ +void recvCb(struct raft_io *io, struct raft_message *message); + +/* Compare a request's term with the server's current term. + * + * The match output parameter will be set to 0 if the local term matches the + * request's term, to -1 if the request's term is lower, and to 1 if the + * request's term is higher. */ +void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match); + +/* Bump the current term and possibly step down from candidate or leader + * state. */ +int recvBumpCurrentTerm(struct raft *r, raft_term term); + +/* Common logic for RPC handlers, comparing the request's term with the server's + * current term and possibly deciding to reject the request or step down from + * candidate or leader. + * + * From Section 3.3: + * + * If a candidate or leader discovers that its term is out of date, it + * immediately reverts to follower state. If a server receives a request with + * a stale term number, it rejects the request. 
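+ *
+ * A minimal usage sketch, mirroring the RPC handlers in this module
+ * (args->term stands for the term carried by the incoming request):
+ *
+ *   int match;
+ *   rv = recvEnsureMatchingTerms(r, args->term, &match);
+ *   if (rv != 0) {
+ *           return rv;
+ *   }
+ *   if (match < 0) {
+ *           goto reply;    (local term is higher: reject)
+ *   }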
+ * + * The match output parameter will be set to 0 if the local term matches the + * request's term, to -1 if the request's term is lower, and to 1 if the + * request's term was higher but we have successfully bumped the local one to + * match it (and stepped down to follower in that case, if we were not + * follower already). */ +int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match); + +/* If different from the current one, update information about the current + * leader. Must be called only by followers. */ +int recvUpdateLeader(struct raft *r, raft_id id, const char *address); + +#endif /* RECV_H_ */ diff --git a/src/raft/recv_append_entries.c b/src/raft/recv_append_entries.c new file mode 100644 index 000000000..7d4adbcc0 --- /dev/null +++ b/src/raft/recv_append_entries.c @@ -0,0 +1,167 @@ +#include "recv_append_entries.h" + +#include "../tracing.h" +#include "assert.h" +#include "convert.h" +#include "entry.h" +#include "flags.h" +#include "heap.h" +#include "log.h" +#include "recv.h" +#include "replication.h" + +static void recvSendAppendEntriesResultCb(struct raft_io_send *req, int status) +{ + (void)status; + RaftHeapFree(req); +} + +int recvAppendEntries(struct raft *r, + raft_id id, + const char *address, + const struct raft_append_entries *args) +{ + struct raft_io_send *req; + struct raft_message message; + struct raft_append_entries_result *result = + &message.append_entries_result; + int match; + bool async; + int rv; + + assert(r != NULL); + assert(id > 0); + assert(args != NULL); + assert(address != NULL); + tracef( + "self:%llu from:%llu@%s leader_commit:%llu n_entries:%d " + "prev_log_index:%llu prev_log_term:%llu, term:%llu", + r->id, id, address, args->leader_commit, args->n_entries, + args->prev_log_index, args->prev_log_term, args->term); + + result->rejected = args->prev_log_index; + result->last_log_index = logLastIndex(r->log); + result->version = RAFT_APPEND_ENTRIES_RESULT_VERSION; + result->features = RAFT_DEFAULT_FEATURE_FLAGS; + + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } + + /* From Figure 3.1: + * + * AppendEntries RPC: Receiver implementation: Reply false if term < + * currentTerm. + */ + if (match < 0) { + tracef("local term is higher -> reject "); + goto reply; + } + + /* If we get here it means that the term in the request matches our + * current term or it was higher and we have possibly stepped down, + * because we discovered the current leader: + * + * From Figure 3.1: + * + * Rules for Servers: Candidates: if AppendEntries RPC is received + * from new leader: convert to follower. + * + * From Section 3.4: + * + * While waiting for votes, a candidate may receive an AppendEntries + * RPC from another server claiming to be leader. If the leader's term + * (included in its RPC) is at least as large as the candidate's + * current term, then the candidate recognizes the leader as legitimate + * and returns to follower state. If the term in the RPC is smaller than + * the candidate's current term, then the candidate rejects the RPC and + * continues in candidate state. + * + * From state diagram in Figure 3.3: + * + * [candidate]: discovers current leader -> [follower] + * + * Note that it should not be possible for us to be in leader state, + * because the leader that is sending us the request should have either + * a lower term (and in that case we reject the request above), or a + * higher term (and in that case we step down). 
It can't have the same + * term because at most one leader can be elected at any given term. + */ + assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); + assert(r->current_term == args->term); + + if (r->state == RAFT_CANDIDATE) { + /* The current term and the peer one must match, otherwise we + * would have either rejected the request or stepped down to + * followers. */ + assert(match == 0); + tracef("discovered leader -> step down "); + convertToFollower(r); + } + + assert(r->state == RAFT_FOLLOWER); + + /* Update current leader because the term in this AppendEntries RPC is + * up to date. */ + rv = recvUpdateLeader(r, id, address); + if (rv != 0) { + return rv; + } + + /* Reset the election timer. */ + r->election_timer_start = r->io->time(r->io); + + /* If we are installing a snapshot, ignore these entries. TODO: we + * should do something smarter, e.g. buffering the entries in the I/O + * backend, which should be in charge of serializing everything. */ + if (replicationInstallSnapshotBusy(r) && args->n_entries > 0) { + tracef("ignoring AppendEntries RPC during snapshot install"); + entryBatchesDestroy(args->entries, args->n_entries); + return 0; + } + + rv = replicationAppend(r, args, &result->rejected, &async); + if (rv != 0) { + return rv; + } + + if (async) { + return 0; + } + + /* Echo back to the leader the point that we reached. */ + result->last_log_index = r->last_stored; + +reply: + result->term = r->current_term; + + /* Free the entries batch, if any. */ + if (args->n_entries > 0 && args->entries[0].batch != NULL) { + raft_free(args->entries[0].batch); + } + + if (args->entries != NULL) { + raft_free(args->entries); + } + + message.type = RAFT_IO_APPEND_ENTRIES_RESULT; + message.server_id = id; + message.server_address = address; + + req = RaftHeapMalloc(sizeof *req); + if (req == NULL) { + return RAFT_NOMEM; + } + req->data = r; + + rv = r->io->send(r->io, req, &message, recvSendAppendEntriesResultCb); + if (rv != 0) { + raft_free(req); + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_append_entries.h b/src/raft/recv_append_entries.h new file mode 100644 index 000000000..5b674860f --- /dev/null +++ b/src/raft/recv_append_entries.h @@ -0,0 +1,14 @@ +/* Receive an AppendEntries message. */ + +#ifndef RECV_APPEND_ENTRIES_H_ +#define RECV_APPEND_ENTRIES_H_ + +#include "../raft.h" + +/* Process an AppendEntries RPC from the given server. 
*/ +int recvAppendEntries(struct raft *r, + raft_id id, + const char *address, + const struct raft_append_entries *args); + +#endif /* RECV_APPEND_ENTRIES_H_ */ diff --git a/src/raft/recv_append_entries_result.c b/src/raft/recv_append_entries_result.c new file mode 100644 index 000000000..ddef54f14 --- /dev/null +++ b/src/raft/recv_append_entries_result.c @@ -0,0 +1,75 @@ +#include "recv_append_entries_result.h" +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "recv.h" +#include "replication.h" + +int recvAppendEntriesResult(struct raft *r, + const raft_id id, + const char *address, + const struct raft_append_entries_result *result) +{ + int match; + const struct raft_server *server; + int rv; + + assert(r != NULL); + assert(id > 0); + assert(address != NULL); + assert(result != NULL); + + tracef( + "self:%llu from:%llu@%s last_log_index:%llu rejected:%llu " + "term:%llu", + r->id, id, address, result->last_log_index, result->rejected, + result->term); + + if (r->state != RAFT_LEADER) { + tracef("local server is not leader -> ignore"); + return 0; + } + + rv = recvEnsureMatchingTerms(r, result->term, &match); + if (rv != 0) { + return rv; + } + + if (match < 0) { + tracef("local term is higher -> ignore "); + return 0; + } + + /* If we have stepped down, abort here. + * + * From Figure 3.1: + * + * [Rules for Servers] All Servers: If RPC request or response + * contains term T > currentTerm: set currentTerm = T, convert to + * follower. + */ + if (match > 0) { + assert(r->state == RAFT_FOLLOWER); + return 0; + } + + assert(result->term == r->current_term); + + /* Ignore responses from servers that have been removed */ + server = configurationGet(&r->configuration, id); + if (server == NULL) { + tracef("unknown server -> ignore"); + return 0; + } + + /* Update the progress of this server, possibly sending further entries. + */ + rv = replicationUpdate(r, server, result); + if (rv != 0) { + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_append_entries_result.h b/src/raft/recv_append_entries_result.h new file mode 100644 index 000000000..8cf8524ac --- /dev/null +++ b/src/raft/recv_append_entries_result.h @@ -0,0 +1,14 @@ +/* Receive an AppendEntries result message. */ + +#ifndef RECV_APPEND_ENTRIES_RESULT_H_ +#define RECV_APPEND_ENTRIES_RESULT_H_ + +#include "../raft.h" + +/* Process an AppendEntries RPC result from the given server. 
*/ +int recvAppendEntriesResult(struct raft *r, + raft_id id, + const char *address, + const struct raft_append_entries_result *result); + +#endif /* RECV_APPEND_ENTRIES_RESULT_H_ */ diff --git a/src/raft/recv_install_snapshot.c b/src/raft/recv_install_snapshot.c new file mode 100644 index 000000000..d3e1493a2 --- /dev/null +++ b/src/raft/recv_install_snapshot.c @@ -0,0 +1,109 @@ +#include "recv_install_snapshot.h" + +#include "../tracing.h" +#include "assert.h" +#include "convert.h" +#include "flags.h" +#include "log.h" +#include "recv.h" +#include "replication.h" + +static void installSnapshotSendCb(struct raft_io_send *req, int status) +{ + (void)status; + raft_free(req); +} + +int recvInstallSnapshot(struct raft *r, + const raft_id id, + const char *address, + struct raft_install_snapshot *args) +{ + struct raft_io_send *req; + struct raft_message message; + struct raft_append_entries_result *result = + &message.append_entries_result; + int rv; + int match; + bool async; + + assert(address != NULL); + tracef( + "self:%llu from:%llu@%s conf_index:%llu last_index:%llu " + "last_term:%llu " + "term:%llu", + r->id, id, address, args->conf_index, args->last_index, + args->last_term, args->term); + + result->rejected = args->last_index; + result->last_log_index = logLastIndex(r->log); + result->version = RAFT_APPEND_ENTRIES_RESULT_VERSION; + result->features = RAFT_DEFAULT_FEATURE_FLAGS; + + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } + + if (match < 0) { + tracef("local term is higher -> reject "); + goto reply; + } + + /* TODO: this logic duplicates the one in the AppendEntries handler */ + assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); + assert(r->current_term == args->term); + if (r->state == RAFT_CANDIDATE) { + assert(match == 0); + tracef("discovered leader -> step down "); + convertToFollower(r); + } + + rv = recvUpdateLeader(r, id, address); + if (rv != 0) { + return rv; + } + r->election_timer_start = r->io->time(r->io); + + rv = replicationInstallSnapshot(r, args, &result->rejected, &async); + if (rv != 0) { + tracef("replicationInstallSnapshot failed %d", rv); + return rv; + } + + if (async) { + return 0; + } + + if (result->rejected == 0) { + /* Echo back to the leader the point that we reached. */ + result->last_log_index = args->last_index; + } + +reply: + result->term = r->current_term; + + /* Free the snapshot data. */ + raft_configuration_close(&args->conf); + raft_free(args->data.base); + + message.type = RAFT_IO_APPEND_ENTRIES_RESULT; + message.server_id = id; + message.server_address = address; + + req = raft_malloc(sizeof *req); + if (req == NULL) { + return RAFT_NOMEM; + } + req->data = r; + + rv = r->io->send(r->io, req, &message, installSnapshotSendCb); + if (rv != 0) { + raft_free(req); + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_install_snapshot.h b/src/raft/recv_install_snapshot.h new file mode 100644 index 000000000..957c605b2 --- /dev/null +++ b/src/raft/recv_install_snapshot.h @@ -0,0 +1,14 @@ +/* InstallSnapshot RPC handlers. */ + +#ifndef RECV_INSTALL_SNAPSHOT_H_ +#define RECV_INSTALL_SNAPSHOT_H_ + +#include "../raft.h" + +/* Process an InstallSnapshot RPC from the given server. 
*/ +int recvInstallSnapshot(struct raft *r, + raft_id id, + const char *address, + struct raft_install_snapshot *args); + +#endif /* RECV_INSTALL_SNAPSHOT_H_ */ diff --git a/src/raft/recv_request_vote.c b/src/raft/recv_request_vote.c new file mode 100644 index 000000000..f51742869 --- /dev/null +++ b/src/raft/recv_request_vote.c @@ -0,0 +1,150 @@ +#include "recv_request_vote.h" + +#include "../tracing.h" +#include "assert.h" +#include "election.h" +#include "recv.h" +#include "replication.h" + +static void requestVoteSendCb(struct raft_io_send *req, int status) +{ + (void)status; + raft_free(req); +} + +int recvRequestVote(struct raft *r, + const raft_id id, + const char *address, + const struct raft_request_vote *args) +{ + struct raft_io_send *req; + struct raft_message message; + struct raft_request_vote_result *result = &message.request_vote_result; + bool has_leader; + int match; + int rv; + + assert(r != NULL); + assert(id > 0); + assert(args != NULL); + + tracef( + "self:%llu from:%llu@%s candidate_id:%llu disrupt_leader:%d " + "last_log_index:%llu " + "last_log_term:%llu pre_vote:%d term:%llu", + r->id, id, address, args->candidate_id, args->disrupt_leader, + args->last_log_index, args->last_log_term, args->pre_vote, + args->term); + result->vote_granted = false; + result->pre_vote = args->pre_vote; + result->version = RAFT_REQUEST_VOTE_RESULT_VERSION; + + /* Reject the request if we have a leader. + * + * From Section 4.2.3: + * + * [Removed] servers should not be able to disrupt a leader whose + * cluster is receiving heartbeats. [...] If a server receives a + * RequestVote request within the minimum election timeout of hearing + * from a current leader, it does not update its term or grant its vote + * + * From Section 4.2.3: + * + * This change conflicts with the leadership transfer mechanism as + * described in Chapter 3, in which a server legitimately starts an + * election without waiting an election timeout. In that case, + * RequestVote messages should be processed by other servers even when + * they believe a current cluster leader exists. Those RequestVote + * requests can include a special flag to indicate this behavior ("I + * have permission to disrupt the leader - it told me to!"). + */ + has_leader = r->state == RAFT_LEADER || + (r->state == RAFT_FOLLOWER && + r->follower_state.current_leader.id != 0); + if (has_leader && !args->disrupt_leader) { + tracef("local server has a leader -> reject "); + goto reply; + } + + /* If this is a pre-vote request, don't actually increment our term or + * persist the vote. */ + if (args->pre_vote) { + recvCheckMatchingTerms(r, args->term, &match); + } else { + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } + } + + /* Reject the request if we are installing a snapshot. + * + * This condition should only be reachable if the disrupt_leader flag is + * set, since otherwise we wouldn't have passed the have_leader check + * above (follower state is not cleared while a snapshot is being + * installed). */ + if (replicationInstallSnapshotBusy(r)) { + tracef("installing snapshot -> reject (disrupt_leader:%d)", + (int)args->disrupt_leader); + goto reply; + } + + /* From Figure 3.1: + * + * RequestVote RPC: Receiver implementation: Reply false if + * term < currentTerm. 
+ * + */ + if (match < 0) { + tracef("local term is higher -> reject "); + goto reply; + } + + /* Unless this is a pre-vote request, at this point our term must be the + * same as the request term (otherwise we would have rejected the + * request or bumped our term). */ + if (!args->pre_vote) { + tracef("no pre_vote: current_term:%llu term:%llu", + r->current_term, args->term); + assert(r->current_term == args->term); + } + + rv = electionVote(r, args, &result->vote_granted); + if (rv != 0) { + return rv; + } + +reply: + result->term = r->current_term; + /* Nodes don't update their term when seeing a Pre-Vote RequestVote RPC. + * To prevent the candidate from ignoring the response of this node if + * it has a smaller term than the candidate, we include the term of the + * request. The smaller term can occur if this node was partitioned from + * the cluster and has reestablished connectivity. This prevents a + * cluster deadlock when a majority of the nodes is online, but they + * fail to establish quorum because the vote of a former partitioned + * node with a smaller term is needed for majority.*/ + if (args->pre_vote) { + result->term = args->term; + } + + message.type = RAFT_IO_REQUEST_VOTE_RESULT; + message.server_id = id; + message.server_address = address; + + req = raft_malloc(sizeof *req); + if (req == NULL) { + return RAFT_NOMEM; + } + req->data = r; + + rv = r->io->send(r->io, req, &message, requestVoteSendCb); + if (rv != 0) { + raft_free(req); + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_request_vote.h b/src/raft/recv_request_vote.h new file mode 100644 index 000000000..9f2583e33 --- /dev/null +++ b/src/raft/recv_request_vote.h @@ -0,0 +1,14 @@ +/* RequestVote RPC handler. */ + +#ifndef RECV_REQUEST_VOTE_H_ +#define RECV_REQUEST_VOTE_H_ + +#include "../raft.h" + +/* Process a RequestVote RPC from the given server. */ +int recvRequestVote(struct raft *r, + raft_id id, + const char *address, + const struct raft_request_vote *args); + +#endif /* RECV_REQUEST_VOTE_H_ */ diff --git a/src/raft/recv_request_vote_result.c b/src/raft/recv_request_vote_result.c new file mode 100644 index 000000000..ca7ece487 --- /dev/null +++ b/src/raft/recv_request_vote_result.c @@ -0,0 +1,154 @@ +#include "recv_request_vote_result.h" + +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "convert.h" +#include "election.h" +#include "recv.h" +#include "replication.h" + +int recvRequestVoteResult(struct raft *r, + raft_id id, + const char *address, + const struct raft_request_vote_result *result) +{ + size_t votes_index; + int match; + int rv; + + (void)address; + + assert(r != NULL); + assert(id > 0); + + tracef( + "self:%llu from:%llu@%s term:%llu vote_granted:%d pre_vote:%d " + "version:%d", + r->id, id, address, result->term, result->vote_granted, + result->pre_vote, result->version); + votes_index = configurationIndexOfVoter(&r->configuration, id); + if (votes_index == r->configuration.n) { + tracef("non-voting or unknown server -> reject"); + return 0; + } + + /* Ignore responses if we are not candidate anymore */ + if (r->state != RAFT_CANDIDATE) { + tracef("local server is not candidate -> ignore"); + return 0; + } + + /* If we're in the pre-vote phase, don't actually increment our term + * right now (we'll do it later, if we start the second phase), and also + * don't step down if the peer is just one term ahead (this is okay as + * in the request we sent our current term plus one). 
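+	 * Example (hypothetical terms): a candidate at term 4 sends
+	 * pre-vote requests carrying term 5, so a response with term 5 is
+	 * exactly one ahead and must not make us step down, while a
+	 * response with term 6 or higher really does come from a newer
+	 * term and bumps ours below.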
*/ + if (r->candidate_state.in_pre_vote) { + recvCheckMatchingTerms(r, result->term, &match); + } else { + rv = recvEnsureMatchingTerms(r, result->term, &match); + if (rv != 0) { + return rv; + } + } + + /* Converted to follower as a result of seeing a higher term. */ + if (r->state != RAFT_CANDIDATE) { + tracef("no longer candidate -> ignore"); + return 0; + } + + if (match < 0) { + /* If the term in the result is older than ours, this is an old + * message we should ignore, because the node who voted for us + * would have obtained our term. This happens if the network is + * pretty choppy. */ + tracef("local term is higher -> ignore"); + return 0; + } + + /* Avoid counting pre-vote votes as regular votes. */ + if (result->version > 1 && result->pre_vote && + !r->candidate_state.in_pre_vote) { + tracef("receive stale pre-vote response -> ignore"); + return 0; + } + + /* This can happen when a candidate wins a pre-vote, bumps its term, + * sends real RequestVote RPCs, crashes, comes online, starts a pre-vote + * and then receives the response to the RequestVote RPC it sent + * out before crashing. */ + if (result->version > 1 && !result->pre_vote && + r->candidate_state.in_pre_vote) { + tracef("receive vote response during pre-vote -> ignore"); + return 0; + } + + /* If we're in the pre-vote phase, check that the peer's is at most one + * term ahead (possibly stepping down). If we're the actual voting + * phase, we expect our term must to be the same as the response term + * (otherwise we would have either ignored the result bumped our term). + */ + if (r->candidate_state.in_pre_vote) { + if (match > 0) { + if (result->term > r->current_term + 1) { + assert(!result->vote_granted); + rv = recvBumpCurrentTerm(r, result->term); + return rv; + } + } + } else { + assert(result->term == r->current_term); + } + + /* If the vote was granted and we reached quorum, convert to leader. + * + * From Figure 3.1: + * + * If votes received from majority of severs: become leader. + * + * From state diagram in Figure 3.3: + * + * [candidate]: receives votes from majority of servers -> [leader] + * + * From Section 3.4: + * + * A candidate wins an election if it receives votes from a majority + * of the servers in the full cluster for the same term. Each server + * will vote for at most one candidate in a given term, on a + * firstcome-first-served basis [...]. Once a candidate wins an + * election, it becomes leader. + */ + if (result->vote_granted) { + if (electionTally(r, votes_index)) { + if (r->candidate_state.in_pre_vote) { + tracef( + "votes quorum reached -> pre-vote " + "successful"); + r->candidate_state.in_pre_vote = false; + rv = electionStart(r); + if (rv != 0) { + return rv; + } + } else { + tracef( + "votes quorum reached -> convert to " + "leader"); + rv = convertToLeader(r); + if (rv != 0) { + return rv; + } + /* Send initial heartbeat. */ + replicationHeartbeat(r); + } + } else { + tracef("votes quorum not reached"); + } + } else { + tracef("vote was not granted"); + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_request_vote_result.h b/src/raft/recv_request_vote_result.h new file mode 100644 index 000000000..344f3ef53 --- /dev/null +++ b/src/raft/recv_request_vote_result.h @@ -0,0 +1,14 @@ +/* Receive a RequestVote result. */ + +#ifndef RECV_REQUEST_VOTE_RESULT_H_ +#define RECV_REQUEST_VOTE_RESULT_H_ + +#include "../raft.h" + +/* Process a RequestVote RPC result from the given server. 
*/ +int recvRequestVoteResult(struct raft *r, + raft_id id, + const char *address, + const struct raft_request_vote_result *result); + +#endif /* RAFT_RECV_REQUEST_VOTE_RESULT_H_ */ diff --git a/src/raft/recv_timeout_now.c b/src/raft/recv_timeout_now.c new file mode 100644 index 000000000..c503c7600 --- /dev/null +++ b/src/raft/recv_timeout_now.c @@ -0,0 +1,81 @@ +#include "recv_timeout_now.h" + +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "convert.h" +#include "log.h" +#include "recv.h" + +int recvTimeoutNow(struct raft *r, + const raft_id id, + const char *address, + const struct raft_timeout_now *args) +{ + const struct raft_server *local_server; + raft_index local_last_index; + raft_term local_last_term; + int match; + int rv; + + assert(r != NULL); + assert(id > 0); + assert(args != NULL); + + (void)address; + + tracef( + "self:%llu from:%llu@%s last_log_index:%llu last_log_term:%llu " + "term:%llu", + r->id, id, address, args->last_log_index, args->last_log_term, + args->term); + /* Ignore the request if we are not voters. */ + local_server = configurationGet(&r->configuration, r->id); + if (local_server == NULL || local_server->role != RAFT_VOTER) { + tracef("non-voter"); + return 0; + } + + /* Ignore the request if we are not follower, or we have different + * leader. */ + if (r->state != RAFT_FOLLOWER || + r->follower_state.current_leader.id != id) { + tracef("Ignore - r->state:%d current_leader.id:%llu", r->state, + r->follower_state.current_leader.id); + return 0; + } + + /* Possibly update our term. Ignore the request if it turns out we have + * a higher term. */ + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } + if (match < 0) { + return 0; + } + + /* Ignore the request if we our log is not up-to-date. */ + local_last_index = logLastIndex(r->log); + local_last_term = logLastTerm(r->log); + if (local_last_index != args->last_log_index || + local_last_term != args->last_log_term) { + return 0; + } + + /* Finally, ignore the request if we're working on persisting some + * entries. */ + if (r->follower_state.append_in_flight_count > 0) { + return 0; + } + + /* Convert to candidate and start a new election. */ + rv = convertToCandidate(r, true /* disrupt leader */); + if (rv != 0) { + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_timeout_now.h b/src/raft/recv_timeout_now.h new file mode 100644 index 000000000..5678c290c --- /dev/null +++ b/src/raft/recv_timeout_now.h @@ -0,0 +1,14 @@ +/* Receive a TimeoutNow message. */ + +#ifndef RECV_TIMEOUT_NOW_H_ +#define RECV_TIMEOUT_NOW_H_ + +#include "../raft.h" + +/* Process a TimeoutNow RPC from the given server. */ +int recvTimeoutNow(struct raft *r, + raft_id id, + const char *address, + const struct raft_timeout_now *args); + +#endif /* RECV_TIMEOUT_NOW_H_ */ diff --git a/src/raft/replication.c b/src/raft/replication.c new file mode 100644 index 000000000..8310feb8b --- /dev/null +++ b/src/raft/replication.c @@ -0,0 +1,1837 @@ +#include + +#include "assert.h" +#include "configuration.h" +#include "convert.h" +#include "entry.h" +#ifdef __GLIBC__ +#include "error.h" +#endif +#include "../tracing.h" +#include "err.h" +#include "flags.h" +#include "heap.h" +#include "lifecycle.h" +#include "log.h" +#include "membership.h" +#include "progress.h" +#include "queue.h" +#include "replication.h" +#include "request.h" +#include "snapshot.h" + +#ifndef max +#define max(a, b) ((a) < (b) ? 
(b) : (a)) +#endif + +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + +/* Context of a RAFT_IO_APPEND_ENTRIES request that was submitted with + * raft_io_>send(). */ +struct sendAppendEntries +{ + struct raft *raft; /* Instance sending the entries. */ + struct raft_io_send send; /* Underlying I/O send request. */ + raft_index index; /* Index of the first entry in the request. */ + struct raft_entry *entries; /* Entries referenced in the request. */ + unsigned n; /* Length of the entries array. */ + raft_id server_id; /* Destination server. */ +}; + +/* Callback invoked after request to send an AppendEntries RPC has completed. */ +static void sendAppendEntriesCb(struct raft_io_send *send, const int status) +{ + struct sendAppendEntries *req = send->data; + struct raft *r = req->raft; + unsigned i = configurationIndexOf(&r->configuration, req->server_id); + + if (r->state == RAFT_LEADER && i < r->configuration.n) { + if (status != 0) { + tracef( + "failed to send append entries to server %llu: %s", + req->server_id, raft_strerror(status)); + /* Go back to probe mode. */ + progressToProbe(r, i); + } + } + + /* Tell the log that we're done referencing these entries. */ + logRelease(r->log, req->index, req->entries, req->n); + raft_free(req); +} + +/* Send an AppendEntries message to the i'th server, including all log entries + * from the given point onwards. */ +static int sendAppendEntries(struct raft *r, + const unsigned i, + const raft_index prev_index, + const raft_term prev_term) +{ + struct raft_server *server = &r->configuration.servers[i]; + struct raft_message message; + struct raft_append_entries *args = &message.append_entries; + struct sendAppendEntries *req; + raft_index next_index = prev_index + 1; + int rv; + + args->term = r->current_term; + args->prev_log_index = prev_index; + args->prev_log_term = prev_term; + + /* TODO: implement a limit to the total size of the entries being sent + */ + rv = logAcquire(r->log, next_index, &args->entries, &args->n_entries); + if (rv != 0) { + goto err; + } + + /* From Section 3.5: + * + * The leader keeps track of the highest index it knows to be + * committed, and it includes that index in future AppendEntries RPCs + * (including heartbeats) so that the other servers eventually find out. + * Once a follower learns that a log entry is committed, it applies the + * entry to its local state machine (in log order) + */ + args->leader_commit = r->commit_index; + + tracef( + "send %u entries starting at %llu to server %llu (last index %llu)", + args->n_entries, args->prev_log_index, server->id, + logLastIndex(r->log)); + + message.type = RAFT_IO_APPEND_ENTRIES; + message.server_id = server->id; + message.server_address = server->address; + + req = raft_malloc(sizeof *req); + if (req == NULL) { + rv = RAFT_NOMEM; + goto err_after_entries_acquired; + } + req->raft = r; + req->index = args->prev_log_index + 1; + req->entries = args->entries; + req->n = args->n_entries; + req->server_id = server->id; + + req->send.data = req; + rv = r->io->send(r->io, &req->send, &message, sendAppendEntriesCb); + if (rv != 0) { + goto err_after_req_alloc; + } + + if (progressState(r, i) == PROGRESS__PIPELINE) { + /* Optimistically update progress. 
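+		 * In pipeline mode we keep streaming without waiting for
+		 * results: e.g. (hypothetical numbers) after sending
+		 * entries 8..10 the next index jumps straight to 11, so
+		 * the following batch can be prepared immediately.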
*/ + progressOptimisticNextIndex(r, i, req->index + req->n); + } + + progressUpdateLastSend(r, i); + return 0; + +err_after_req_alloc: + raft_free(req); +err_after_entries_acquired: + logRelease(r->log, next_index, args->entries, args->n_entries); +err: + assert(rv != 0); + return rv; +} + +/* Context of a RAFT_IO_INSTALL_SNAPSHOT request that was submitted with + * raft_io_>send(). */ +struct sendInstallSnapshot +{ + struct raft *raft; /* Instance sending the snapshot. */ + struct raft_io_snapshot_get get; /* Snapshot get request. */ + struct raft_io_send send; /* Underlying I/O send request. */ + struct raft_snapshot *snapshot; /* Snapshot to send. */ + raft_id server_id; /* Destination server. */ +}; + +static void sendInstallSnapshotCb(struct raft_io_send *send, int status) +{ + struct sendInstallSnapshot *req = send->data; + struct raft *r = req->raft; + const struct raft_server *server; + + server = configurationGet(&r->configuration, req->server_id); + + if (status != 0) { + tracef("send install snapshot: %s", raft_strerror(status)); + if (r->state == RAFT_LEADER && server != NULL) { + unsigned i; + i = configurationIndexOf(&r->configuration, + req->server_id); + progressAbortSnapshot(r, i); + } + } + + snapshotClose(req->snapshot); + raft_free(req->snapshot); + raft_free(req); +} + +static void sendSnapshotGetCb(struct raft_io_snapshot_get *get, + struct raft_snapshot *snapshot, + int status) +{ + struct sendInstallSnapshot *req = get->data; + struct raft *r = req->raft; + struct raft_message message; + struct raft_install_snapshot *args = &message.install_snapshot; + const struct raft_server *server = NULL; + bool progress_state_is_snapshot = false; + unsigned i = 0; + int rv; + + if (status != 0) { + tracef("get snapshot %s", raft_strerror(status)); + goto abort; + } + if (r->state != RAFT_LEADER) { + goto abort_with_snapshot; + } + + server = configurationGet(&r->configuration, req->server_id); + + if (server == NULL) { + /* Probably the server was removed in the meantime. */ + goto abort_with_snapshot; + } + + i = configurationIndexOf(&r->configuration, req->server_id); + progress_state_is_snapshot = progressState(r, i) == PROGRESS__SNAPSHOT; + + if (!progress_state_is_snapshot) { + /* Something happened in the meantime. 
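+		 * One way this can happen (an assumption, for
+		 * illustration): an earlier send failed and
+		 * progressAbortSnapshot() already moved this follower back
+		 * to probe mode, making this snapshot stale.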
*/ + goto abort_with_snapshot; + } + + assert(snapshot->n_bufs == 1); + + message.type = RAFT_IO_INSTALL_SNAPSHOT; + message.server_id = server->id; + message.server_address = server->address; + + args->term = r->current_term; + args->last_index = snapshot->index; + args->last_term = snapshot->term; + args->conf_index = snapshot->configuration_index; + args->conf = snapshot->configuration; + args->data = snapshot->bufs[0]; + + req->snapshot = snapshot; + req->send.data = req; + + tracef("sending snapshot with last index %llu to %llu", snapshot->index, + server->id); + + rv = r->io->send(r->io, &req->send, &message, sendInstallSnapshotCb); + if (rv != 0) { + goto abort_with_snapshot; + } + + goto out; + +abort_with_snapshot: + snapshotClose(snapshot); + raft_free(snapshot); +abort: + if (r->state == RAFT_LEADER && server != NULL && + progress_state_is_snapshot) { + progressAbortSnapshot(r, i); + } + raft_free(req); +out: + return; +} + +/* Send the latest snapshot to the i'th server */ +static int sendSnapshot(struct raft *r, const unsigned i) +{ + struct raft_server *server = &r->configuration.servers[i]; + struct sendInstallSnapshot *request; + int rv; + + progressToSnapshot(r, i); + + request = raft_malloc(sizeof *request); + if (request == NULL) { + rv = RAFT_NOMEM; + goto err; + } + request->raft = r; + request->server_id = server->id; + request->get.data = request; + + /* TODO: make sure that the I/O implementation really returns the latest + * snapshot *at this time* and not any snapshot that might be stored at + * a later point. Otherwise the progress snapshot_index would be wrong. + */ + rv = r->io->snapshot_get(r->io, &request->get, sendSnapshotGetCb); + if (rv != 0) { + goto err_after_req_alloc; + } + + progressUpdateSnapshotLastSend(r, i); + return 0; + +err_after_req_alloc: + raft_free(request); +err: + progressAbortSnapshot(r, i); + assert(rv != 0); + return rv; +} + +int replicationProgress(struct raft *r, unsigned i) +{ + struct raft_server *server = &r->configuration.servers[i]; + bool progress_state_is_snapshot = + progressState(r, i) == PROGRESS__SNAPSHOT; + raft_index snapshot_index = logSnapshotIndex(r->log); + raft_index next_index = progressNextIndex(r, i); + raft_index prev_index; + raft_term prev_term; + + assert(r->state == RAFT_LEADER); + assert(server->id != r->id); + assert(next_index >= 1); + + if (!progressShouldReplicate(r, i)) { + return 0; + } + + /* From Section 3.5: + * + * When sending an AppendEntries RPC, the leader includes the index + * and term of the entry in its log that immediately precedes the new + * entries. If the follower does not find an entry in its log with the + * same index and term, then it refuses the new entries. The + * consistency check acts as an induction step: the initial empty state + * of the logs satisfies the Log Matching Property, and the consistency + * check preserves the Log Matching Property whenever logs are extended. + * As a result, whenever AppendEntries returns successfully, the leader + * knows that the follower's log is identical to its own log up through + * the new entries (Log Matching Property in Figure 3.2). + */ + if (next_index == 1) { + /* We're including the very first entry, so prevIndex and + * prevTerm are null. If the first entry is not available + * anymore, send the last snapshot if we're not already sending + * one. 
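+		 * Example (hypothetical numbers): after a snapshot at
+		 * index 1000 the log only holds entries from 1001
+		 * onwards, so a follower that still needs entry 1 can
+		 * only be brought up to date by shipping the snapshot.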
*/ + if (snapshot_index > 0 && !progress_state_is_snapshot) { + raft_index last_index = logLastIndex(r->log); + assert(last_index > 0); /* The log can't be empty */ + goto send_snapshot; + } + prev_index = 0; + prev_term = 0; + } else { + /* Set prevIndex and prevTerm to the index and term of the entry + * at next_index - 1. */ + prev_index = next_index - 1; + prev_term = logTermOf(r->log, prev_index); + /* If the entry is not anymore in our log, send the last + * snapshot if we're not doing so already. */ + if (prev_term == 0 && !progress_state_is_snapshot) { + assert(prev_index < snapshot_index); + tracef("missing entry at index %lld -> send snapshot", + prev_index); + goto send_snapshot; + } + } + + /* Send empty AppendEntries RPC when installing a snaphot */ + if (progress_state_is_snapshot) { + prev_index = logLastIndex(r->log); + prev_term = logLastTerm(r->log); + } + + return sendAppendEntries(r, i, prev_index, prev_term); + +send_snapshot: + if (progressGetRecentRecv(r, i)) { + /* Only send a snapshot when we have heard from the server */ + return sendSnapshot(r, i); + } else { + /* Send empty AppendEntries RPC when we haven't heard from the + * server */ + prev_index = logLastIndex(r->log); + prev_term = logLastTerm(r->log); + return sendAppendEntries(r, i, prev_index, prev_term); + } +} + +/* Possibly trigger I/O requests for newly appended log entries or heartbeats. + * + * This function loops through all followers and triggers replication on them. + * + * It must be called only by leaders. */ +static int triggerAll(struct raft *r) +{ + unsigned i; + int rv; + + assert(r->state == RAFT_LEADER); + + /* Trigger replication for servers we didn't hear from recently. */ + for (i = 0; i < r->configuration.n; i++) { + struct raft_server *server = &r->configuration.servers[i]; + if (server->id == r->id) { + continue; + } + /* Skip spare servers, unless they're being promoted. */ + if (server->role == RAFT_SPARE && + server->id != r->leader_state.promotee_id) { + continue; + } + rv = replicationProgress(r, i); + if (rv != 0 && rv != RAFT_NOCONNECTION) { + /* This is not a critical failure, let's just log it. */ + tracef( + "failed to send append entries to server %llu: %s " + "(%d)", + server->id, raft_strerror(rv), rv); + } + } + + return 0; +} + +int replicationHeartbeat(struct raft *r) +{ + return triggerAll(r); +} + +/* Context for a write log entries request that was submitted by a leader. */ +struct appendLeader +{ + struct raft *raft; /* Instance that has submitted the request */ + raft_index index; /* Index of the first entry in the request. */ + struct raft_entry *entries; /* Entries referenced in the request. */ + unsigned n; /* Length of the entries array. */ + struct raft_io_append req; +}; + +/* Called after a successful append entries I/O request to update the index of + * the last entry stored on disk. Return how many new entries that are still + * present in our in-memory log were stored. */ +static size_t updateLastStored(struct raft *r, + raft_index first_index, + struct raft_entry *entries, + size_t n_entries) +{ + size_t i; + + /* Check which of these entries is still in our in-memory log */ + for (i = 0; i < n_entries; i++) { + struct raft_entry *entry = &entries[i]; + raft_index index = first_index + i; + raft_term local_term = logTermOf(r->log, index); + + /* If we have no entry at this index, or if the entry we have + * now has a different term, it means that this entry got + * truncated, so let's stop here. 
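+		 * Example (hypothetical): we submitted entry 9 with term 2
+		 * for writing, but a new leader has since truncated our
+		 * in-memory log and index 9 now holds an entry with term
+		 * 3, so the completed write must not count towards
+		 * last_stored.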
*/ + if (local_term == 0 || + (local_term > 0 && local_term != entry->term)) { + break; + } + + /* If we do have an entry at this index, its term must match the + * one of the entry we wrote on disk. */ + assert(local_term != 0 && local_term == entry->term); + } + + r->last_stored += i; + return i; +} + +/* Get the request matching the given @index and @type, if any. + * The type check is skipped when @type == -1. */ +static struct request *getRequest(struct raft *r, + const raft_index index, + int type) +{ + queue *head; + struct request *req; + + if (r->state != RAFT_LEADER) { + return NULL; + } + QUEUE_FOREACH(head, &r->leader_state.requests) + { + req = QUEUE_DATA(head, struct request, queue); + if (req->index == index) { + if (type != -1) { + assert(req->type == type); + } + lifecycleRequestEnd(r, req); + return req; + } + } + return NULL; +} + +/* Invoked once a disk write request for new entries has been completed. */ +static void appendLeaderCb(struct raft_io_append *append, int status) +{ + struct appendLeader *request = append->data; + struct raft *r = request->raft; + size_t server_index; + raft_index index; + int rv; + + tracef("leader: written %u entries starting at %lld: status %d", + request->n, request->index, status); + + /* In case of a failed disk write, if we were the leader creating these + * entries in the first place, truncate our log too (since we have + * appended these entries to it) and fire the request callbacks. + * + * Afterward, convert immediately to follower state, giving the cluster + * a chance to elect another leader that doesn't have a full disk (or + * whatever caused our write error). */ + if (status != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); + for (unsigned i = 0; i < request->n; i++) { + const struct request *req = + getRequest(r, request->index + i, -1); + if (!req) { + tracef("no request found at index %llu", + request->index + i); + continue; + } + switch (req->type) { + case RAFT_COMMAND: { + struct raft_apply *apply = + (struct raft_apply *)req; + if (apply->cb) { + apply->cb(apply, status, NULL); + } + break; + } + case RAFT_BARRIER: { + struct raft_barrier *barrier = + (struct raft_barrier *)req; + if (barrier->cb) { + barrier->cb(barrier, status); + } + break; + } + case RAFT_CHANGE: { + struct raft_change *change = + (struct raft_change *)req; + if (change->cb) { + change->cb(change, status); + } + break; + } + default: + tracef( + "unknown request type, shutdown."); + assert(false); + break; + } + } + goto out; + } + + updateLastStored(r, request->index, request->entries, request->n); + + /* If we are not leader anymore, just discard the result. */ + if (r->state != RAFT_LEADER) { + tracef("local server is not leader -> ignore write log result"); + goto out; + } + + /* Only update the next index if we are part of the current + * configuration. The only case where this is not true is when we were + * asked to remove ourselves from the cluster. + * + * From Section 4.2.2: + * + * there will be a period of time (while it is committing Cnew) when a + * leader can manage a cluster that does not include itself; it + * replicates log entries but does not count itself in majorities. + */ + server_index = configurationIndexOf(&r->configuration, r->id); + if (server_index < r->configuration.n) { + r->leader_state.progress[server_index].match_index = + r->last_stored; + } + + /* Check if we can commit some new entries. 
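+	 * An entry of the current term becomes committed once a majority of
+	 * voters (counting ourselves, whose match index was bumped just
+	 * above) have it stored.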
+	 */
+	replicationQuorum(r, r->last_stored);
+
+	rv = replicationApply(r);
+	if (rv != 0) {
+		/* TODO: just log the error? */
+	}
+
+out:
+	/* Tell the log that we're done referencing these entries. */
+	logRelease(r->log, request->index, request->entries, request->n);
+	index = request->index;
+	raft_free(request);
+	if (status != 0) {
+		if (index <= logLastIndex(r->log)) {
+			logTruncate(r->log, index);
+		}
+		if (r->state == RAFT_LEADER) {
+			convertToFollower(r);
+		}
+	}
+}
+
+/* Submit a disk write for all entries from the given index onward. */
+static int appendLeader(struct raft *r, raft_index index)
+{
+	struct raft_entry *entries = NULL;
+	unsigned n;
+	struct appendLeader *request;
+	int rv;
+
+	assert(r->state == RAFT_LEADER);
+	assert(index > 0);
+	assert(index > r->last_stored);
+
+	/* Acquire all the entries from the given index onwards. */
+	rv = logAcquire(r->log, index, &entries, &n);
+	if (rv != 0) {
+		goto err;
+	}
+
+	/* We expect this function to be called only when there are actually
+	 * some entries to write. */
+	if (n == 0) {
+		assert(false);
+		tracef("No log entries found at index %llu", index);
+		ErrMsgPrintf(r->errmsg, "No log entries found at index %llu",
+			     index);
+		rv = RAFT_SHUTDOWN;
+		goto err_after_entries_acquired;
+	}
+
+	/* Allocate a new request. */
+	request = raft_malloc(sizeof *request);
+	if (request == NULL) {
+		rv = RAFT_NOMEM;
+		goto err_after_entries_acquired;
+	}
+
+	request->raft = r;
+	request->index = index;
+	request->entries = entries;
+	request->n = n;
+	request->req.data = request;
+
+	rv = r->io->append(r->io, &request->req, entries, n, appendLeaderCb);
+	if (rv != 0) {
+		ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
+		goto err_after_request_alloc;
+	}
+
+	return 0;
+
+err_after_request_alloc:
+	raft_free(request);
+err_after_entries_acquired:
+	logRelease(r->log, index, entries, n);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+int replicationTrigger(struct raft *r, raft_index index)
+{
+	int rv;
+
+	rv = appendLeader(r, index);
+	if (rv != 0) {
+		return rv;
+	}
+
+	return triggerAll(r);
+}
+
+/* Helper to be invoked after a promotion of a non-voting server has been
+ * requested via @raft_assign and that server has caught up with our log.
+ *
+ * This function changes the local configuration, marking the server being
+ * promoted as actually voting, appends a RAFT_CHANGE entry with the new
+ * configuration to the local log and triggers its replication. */
+static int triggerActualPromotion(struct raft *r)
+{
+	raft_index index;
+	raft_term term = r->current_term;
+	size_t server_index;
+	struct raft_server *server;
+	int old_role;
+	int rv;
+
+	assert(r->state == RAFT_LEADER);
+	assert(r->leader_state.promotee_id != 0);
+
+	server_index = configurationIndexOf(&r->configuration,
+					    r->leader_state.promotee_id);
+	assert(server_index < r->configuration.n);
+
+	server = &r->configuration.servers[server_index];
+
+	assert(server->role != RAFT_VOTER);
+
+	/* Update our current configuration. */
+	old_role = server->role;
+	server->role = RAFT_VOTER;
+
+	/* Index of the entry being appended. */
+	index = logLastIndex(r->log) + 1;
+
+	/* Encode the new configuration and append it to the log. */
+	rv = logAppendConfiguration(r->log, term, &r->configuration);
+	if (rv != 0) {
+		goto err;
+	}
+
+	/* Start writing the new log entry to disk and send it to the
+	 * followers.
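+	 * replicationTrigger() below does both: it submits the local disk
+	 * write (via appendLeader()) and kicks off AppendEntries messages to
+	 * the followers (via triggerAll()).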
+	 */
+	rv = replicationTrigger(r, index);
+	if (rv != 0) {
+		goto err_after_log_append;
+	}
+
+	r->leader_state.promotee_id = 0;
+	r->configuration_uncommitted_index = logLastIndex(r->log);
+
+	return 0;
+
+err_after_log_append:
+	logTruncate(r->log, index);
+
+err:
+	server->role = old_role;
+
+	assert(rv != 0);
+	return rv;
+}
+
+int replicationUpdate(struct raft *r,
+		      const struct raft_server *server,
+		      const struct raft_append_entries_result *result)
+{
+	bool is_being_promoted;
+	raft_index last_index;
+	unsigned i;
+	int rv;
+
+	i = configurationIndexOf(&r->configuration, server->id);
+
+	assert(r->state == RAFT_LEADER);
+	assert(i < r->configuration.n);
+
+	progressMarkRecentRecv(r, i);
+
+	progressSetFeatures(r, i, result->features);
+
+	/* If the RPC failed because of a log mismatch, retry.
+	 *
+	 * From Figure 3.1:
+	 *
+	 *   [Rules for servers] Leaders:
+	 *
+	 *   - If AppendEntries fails because of log inconsistency:
+	 *     decrement nextIndex and retry.
+	 */
+	if (result->rejected > 0) {
+		bool retry;
+		retry = progressMaybeDecrement(r, i, result->rejected,
+					       result->last_log_index);
+		if (retry) {
+			/* Retry, ignoring errors. */
+			tracef("log mismatch -> send old entries to %llu",
+			       server->id);
+			replicationProgress(r, i);
+		}
+		return 0;
+	}
+
+	/* In case of success the remote server is expected to send us back the
+	 * value of prevLogIndex + len(entriesToAppend). If it has a longer
+	 * log, it might be a leftover from previous terms. */
+	last_index = result->last_log_index;
+	if (last_index > logLastIndex(r->log)) {
+		last_index = logLastIndex(r->log);
+	}
+
+	/* If the RPC succeeded, update our counters for this server.
+	 *
+	 * From Figure 3.1:
+	 *
+	 *   [Rules for servers] Leaders:
+	 *
+	 *   If successful: update nextIndex and matchIndex for follower.
+	 */
+	if (!progressMaybeUpdate(r, i, last_index)) {
+		return 0;
+	}
+
+	switch (progressState(r, i)) {
+		case PROGRESS__SNAPSHOT:
+			/* If a snapshot has been installed, transition back to
+			 * probe */
+			if (progressSnapshotDone(r, i)) {
+				progressToProbe(r, i);
+			}
+			break;
+		case PROGRESS__PROBE:
+			/* Transition to pipeline */
+			progressToPipeline(r, i);
+	}
+
+	/* If the server is currently being promoted and is catching up with
+	 * our log, update the information about the current catch-up round,
+	 * and possibly proceed with the promotion. */
+	is_being_promoted = r->leader_state.promotee_id != 0 &&
+			    r->leader_state.promotee_id == server->id;
+	if (is_being_promoted) {
+		bool is_up_to_date = membershipUpdateCatchUpRound(r);
+		if (is_up_to_date) {
+			rv = triggerActualPromotion(r);
+			if (rv != 0) {
+				return rv;
+			}
+		}
+	}
+
+	/* Check if we can commit some new entries. */
+	replicationQuorum(r, last_index);
+
+	rv = replicationApply(r);
+	if (rv != 0) {
+		/* TODO: just log the error? */
+	}
+
+	/* Abort here if we have been removed and are no longer leader. */
+	if (r->state != RAFT_LEADER) {
+		goto out;
+	}
+
+	/* Look up the server index again, since the server might have been
+	 * removed from the configuration. */
+	i = configurationIndexOf(&r->configuration, server->id);
+
+	if (i < r->configuration.n) {
+		/* If we are transferring leadership to this follower, check
+		 * if its log is now up-to-date and, if so, send it a
+		 * TimeoutNow RPC (unless we already did).
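+		 * (The send.data field of the transfer request doubles as
+		 * the "did we already send it" marker checked below.)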
*/ + if (r->transfer != NULL && r->transfer->id == server->id) { + if (progressPersistedIsUpToDate(r, i) && + r->transfer->send.data == NULL) { + rv = membershipLeadershipTransferStart(r); + if (rv != 0) { + membershipLeadershipTransferClose(r); + } + } + } + /* If this follower is in pipeline mode, send it more entries. + */ + if (progressState(r, i) == PROGRESS__PIPELINE) { + replicationProgress(r, i); + } + } + +out: + return 0; +} + +static void sendAppendEntriesResultCb(struct raft_io_send *req, int status) +{ + (void)status; + RaftHeapFree(req); +} + +static void sendAppendEntriesResult( + struct raft *r, + const struct raft_append_entries_result *result) +{ + struct raft_message message; + struct raft_io_send *req; + int rv; + + assert(r->state == RAFT_FOLLOWER); + message.type = RAFT_IO_APPEND_ENTRIES_RESULT; + message.server_id = r->follower_state.current_leader.id; + message.server_address = r->follower_state.current_leader.address; + message.append_entries_result = *result; + + req = raft_malloc(sizeof *req); + if (req == NULL) { + return; + } + req->data = r; + + rv = r->io->send(r->io, req, &message, sendAppendEntriesResultCb); + if (rv != 0) { + raft_free(req); + } +} + +/* Context for a write log entries request that was submitted by a follower. */ +struct appendFollower +{ + struct raft *raft; /* Instance that has submitted the request */ + raft_index index; /* Index of the first entry in the request. */ + struct raft_append_entries args; + struct raft_io_append req; +}; + +static void appendFollowerCb(struct raft_io_append *req, int status) +{ + struct appendFollower *request = req->data; + struct raft *r = request->raft; + struct raft_append_entries *args = &request->args; + struct raft_append_entries_result result; + size_t i; + size_t j; + int rv; + + tracef("I/O completed on follower: status %d", status); + + assert(args->entries != NULL); + assert(args->n_entries > 0); + + assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE); + if (r->state == RAFT_UNAVAILABLE) { + goto out; + } + assert(r->follower_state.append_in_flight_count > 0); + r->follower_state.append_in_flight_count -= 1; + + result.term = r->current_term; + result.version = RAFT_APPEND_ENTRIES_RESULT_VERSION; + result.features = RAFT_DEFAULT_FEATURE_FLAGS; + if (status != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); + result.rejected = args->prev_log_index + 1; + goto respond; + } + + /* We received an InstallSnapshot RPC while these entries were being + * persisted to disk */ + if (replicationInstallSnapshotBusy(r)) { + goto out; + } + + i = updateLastStored(r, request->index, args->entries, args->n_entries); + + /* If none of the entries that we persisted is present anymore in our + * in-memory log, there's nothing to report or to do. We just discard + * them. */ + if (i == 0) { + goto out; + } + + /* Possibly apply configuration changes as uncommitted. */ + for (j = 0; j < i; j++) { + struct raft_entry *entry = &args->entries[j]; + raft_index index = request->index + j; + raft_term local_term = logTermOf(r->log, index); + + assert(local_term != 0 && local_term == entry->term); + + if (entry->type == RAFT_CHANGE) { + rv = membershipUncommittedChange(r, index, entry); + if (rv != 0) { + goto out; + } + } + } + + /* From Figure 3.1: + * + * AppendEntries RPC: Receiver implementation: If leaderCommit > + * commitIndex, set commitIndex = min(leaderCommit, index of last new + * entry). 
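+	 *
+	 * For instance (illustrative numbers): if the leader reports
+	 * leaderCommit = 7 but we have only stored entries up to index 5,
+	 * commitIndex becomes min(7, 5) = 5; the remaining entries are
+	 * committed once they reach our disk.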
+ */ + if (args->leader_commit > r->commit_index && + r->last_stored >= r->commit_index) { + r->commit_index = min(args->leader_commit, r->last_stored); + rv = replicationApply(r); + if (rv != 0) { + goto out; + } + } + + /* If our term number has changed since receiving these entries, + * our current_leader may have changed as well, so don't send a response + * to that server. */ + if (r->current_term != args->term) { + tracef( + "new role or term since receiving entries -> don't " + "respond"); + goto out; + } + + result.rejected = 0; + +respond: + result.last_log_index = r->last_stored; + sendAppendEntriesResult(r, &result); + +out: + logRelease(r->log, request->index, request->args.entries, + request->args.n_entries); + + /* If the write failed, we need to truncate the log. */ + if (status != 0) { + if (request->index <= logLastIndex(r->log)) { + logTruncate(r->log, request->index); + } + } + + raft_free(request); +} + +/* Check the log matching property against an incoming AppendEntries request. + * + * From Figure 3.1: + * + * [AppendEntries RPC] Receiver implementation: + * + * 2. Reply false if log doesn't contain an entry at prevLogIndex whose + * term matches prevLogTerm. + * + * Return 0 if the check passed. + * + * Return 1 if the check did not pass and the request needs to be rejected. + * + * Return -1 if there's a conflict and we need to shutdown. */ +static int checkLogMatchingProperty(struct raft *r, + const struct raft_append_entries *args) +{ + raft_term local_prev_term; + + /* If this is the very first entry, there's nothing to check. */ + if (args->prev_log_index == 0) { + return 0; + } + + local_prev_term = logTermOf(r->log, args->prev_log_index); + if (local_prev_term == 0) { + tracef("no entry at index %llu -> reject", + args->prev_log_index); + return 1; + } + + if (local_prev_term != args->prev_log_term) { + if (args->prev_log_index <= r->commit_index) { + /* Should never happen; something is seriously wrong! */ + tracef( + "conflicting terms %llu and %llu for entry %llu " + "(commit " + "index %llu) -> shutdown", + local_prev_term, args->prev_log_term, + args->prev_log_index, r->commit_index); + return -1; + } + tracef("previous term mismatch -> reject"); + return 1; + } + + return 0; +} + +/* Delete from our log all entries that conflict with the ones in the given + * AppendEntries request. + * + * From Figure 3.1: + * + * [AppendEntries RPC] Receiver implementation: + * + * 3. If an existing entry conflicts with a new one (same index but + * different terms), delete the existing entry and all that follow it. + * + * The i output parameter will be set to the array index of the first new log + * entry that we don't have yet in our log, among the ones included in the given + * AppendEntries request. */ +static int deleteConflictingEntries(struct raft *r, + const struct raft_append_entries *args, + size_t *i) +{ + size_t j; + int rv; + + for (j = 0; j < args->n_entries; j++) { + struct raft_entry *entry = &args->entries[j]; + raft_index entry_index = args->prev_log_index + 1 + j; + raft_term local_term = logTermOf(r->log, entry_index); + + if (local_term > 0 && local_term != entry->term) { + if (entry_index <= r->commit_index) { + /* Should never happen; something is seriously + * wrong! */ + tracef( + "new index conflicts with committed entry " + "-> shutdown"); + return RAFT_SHUTDOWN; + } + + tracef("log mismatch -> truncate (%llu)", entry_index); + + /* Possibly discard uncommitted configuration changes. 
+			 */
+			if (r->configuration_uncommitted_index >=
+			    entry_index) {
+				rv = membershipRollback(r);
+				if (rv != 0) {
+					return rv;
+				}
+			}
+
+			/* Delete all entries from this index on because they
+			 * don't match. */
+			rv = r->io->truncate(r->io, entry_index);
+			if (rv != 0) {
+				return rv;
+			}
+			logTruncate(r->log, entry_index);
+
+			/* Drop information about previously stored entries
+			 * that have just been discarded. */
+			if (r->last_stored >= entry_index) {
+				r->last_stored = entry_index - 1;
+			}
+
+			/* We want to append all entries from here on,
+			 * replacing anything that we had before. */
+			break;
+		} else if (local_term == 0) {
+			/* We don't have an entry at this index, so we want to
+			 * append this new one and all the subsequent ones. */
+			break;
+		}
+	}
+
+	*i = j;
+
+	return 0;
+}
+
+int replicationAppend(struct raft *r,
+		      const struct raft_append_entries *args,
+		      raft_index *rejected,
+		      bool *async)
+{
+	struct appendFollower *request;
+	int match;
+	size_t n;
+	size_t i;
+	size_t j;
+	bool reinstated;
+	int rv;
+
+	assert(r != NULL);
+	assert(args != NULL);
+	assert(rejected != NULL);
+	assert(async != NULL);
+
+	assert(r->state == RAFT_FOLLOWER);
+
+	*rejected = args->prev_log_index;
+	*async = false;
+
+	/* Check the log matching property. */
+	match = checkLogMatchingProperty(r, args);
+	if (match != 0) {
+		assert(match == 1 || match == -1);
+		return match == 1 ? 0 : RAFT_SHUTDOWN;
+	}
+
+	/* Delete conflicting entries. */
+	rv = deleteConflictingEntries(r, args, &i);
+	if (rv != 0) {
+		return rv;
+	}
+
+	*rejected = 0;
+
+	n = args->n_entries - i; /* Number of new entries */
+
+	/* If this is an empty AppendEntries, there's nothing to write.
+	 * However, we still want to check if we can commit some entries.
+	 * Don't commit anything while a snapshot install is busy, though:
+	 * r->last_stored will be 0 in that case.
+	 *
+	 * From Figure 3.1:
+	 *
+	 *   AppendEntries RPC: Receiver implementation: If leaderCommit >
+	 *   commitIndex, set commitIndex = min(leaderCommit, index of last
+	 *   new entry).
+	 */
+	if (n == 0) {
+		if ((args->leader_commit > r->commit_index) &&
+		    r->last_stored >= r->commit_index &&
+		    !replicationInstallSnapshotBusy(r)) {
+			r->commit_index =
+			    min(args->leader_commit, r->last_stored);
+			rv = replicationApply(r);
+			if (rv != 0) {
+				return rv;
+			}
+		}
+
+		return 0;
+	}
+
+	*async = true;
+
+	request = raft_malloc(sizeof *request);
+	if (request == NULL) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+
+	request->raft = r;
+	request->args = *args;
+	/* Index of first new entry */
+	request->index = args->prev_log_index + 1 + i;
+
+	/* Update our in-memory log to reflect that we received these entries.
+	 * We'll notify the leader of a successful append once the write
+	 * entries request that we issue below actually completes. */
+	for (j = 0; j < n; j++) {
+		struct raft_entry *entry = &args->entries[i + j];
+
+		/* We are trying to append an entry at index X with term T to
+		 * our in-memory log. If we've gotten this far, we know that
+		 * the log *logically* has no entry at this index. However,
+		 * it's possible that we're still hanging on to such an entry,
+		 * because we previously tried to append and replicate it, and
+		 * the associated disk write failed, but some send requests
+		 * are still pending that refer to it. Since the log is not
+		 * capable of tracking multiple independent entries that share
+		 * an index and term, we just piggyback on the already-stored
+		 * entry in this case.
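+		 *
+		 * Concretely (hypothetical sequence): we appended entry
+		 * (index 5, term 2), the disk write failed and the entry was
+		 * logically removed, but a pending send request still
+		 * references it. If the leader now retransmits (index 5,
+		 * term 2), logReinstate() below revives the retained entry
+		 * instead of storing a second, independent copy.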
+		 */
+		rv = logReinstate(r->log, entry->term, entry->type,
+				  &reinstated);
+		if (rv != 0) {
+			goto err_after_request_alloc;
+		} else if (reinstated) {
+			continue;
+		}
+
+		/* TODO This copy should not strictly be necessary, as the
+		 * batch logic will take care of freeing the batch buffer in
+		 * which the entries are received. However, this would lead to
+		 * memory spikes in certain edge cases.
+		 * https://github.com/canonical/dqlite/issues/276
+		 */
+		struct raft_entry copy = {0};
+		rv = entryCopy(entry, &copy);
+		if (rv != 0) {
+			goto err_after_request_alloc;
+		}
+
+		rv = logAppend(r->log, copy.term, copy.type, &copy.buf, NULL);
+		if (rv != 0) {
+			goto err_after_request_alloc;
+		}
+	}
+
+	/* Acquire the relevant entries from the log. */
+	rv = logAcquire(r->log, request->index, &request->args.entries,
+			&request->args.n_entries);
+	if (rv != 0) {
+		goto err_after_request_alloc;
+	}
+
+	assert(request->args.n_entries == n);
+	if (request->args.n_entries == 0) {
+		tracef("No log entries found at index %llu", request->index);
+		ErrMsgPrintf(r->errmsg, "No log entries found at index %llu",
+			     request->index);
+		rv = RAFT_SHUTDOWN;
+		goto err_after_acquire_entries;
+	}
+
+	request->req.data = request;
+	rv = r->io->append(r->io, &request->req, request->args.entries,
+			   request->args.n_entries, appendFollowerCb);
+	if (rv != 0) {
+		ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
+		goto err_after_acquire_entries;
+	}
+	r->follower_state.append_in_flight_count += 1;
+
+	entryBatchesDestroy(args->entries, args->n_entries);
+	return 0;
+
+err_after_acquire_entries:
+	/* Release the entries related to the IO request */
+	logRelease(r->log, request->index, request->args.entries,
+		   request->args.n_entries);
+
+err_after_request_alloc:
+	/* Release all entries added to the in-memory log, making sure the
+	 * in-memory log and disk don't diverge, which would otherwise lead to
+	 * future log entries not being persisted to disk. */
+	if (j != 0) {
+		logTruncate(r->log, request->index);
+	}
+	raft_free(request);
+
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+struct recvInstallSnapshot
+{
+	struct raft *raft;
+	struct raft_snapshot snapshot;
+	raft_term term; /* Used to check for state transitions. */
+};
+
+static void installSnapshotCb(struct raft_io_snapshot_put *req, int status)
+{
+	struct recvInstallSnapshot *request = req->data;
+	struct raft *r = request->raft;
+	struct raft_snapshot *snapshot = &request->snapshot;
+	struct raft_append_entries_result result;
+	bool should_respond = true;
+	int rv;
+
+	/* We avoid converting to candidate state while installing a
+	 * snapshot. */
+	assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE);
+
+	r->snapshot.put.data = NULL;
+
+	result.term = r->current_term;
+	result.version = RAFT_APPEND_ENTRIES_RESULT_VERSION;
+	result.features = RAFT_DEFAULT_FEATURE_FLAGS;
+	result.rejected = 0;
+
+	/* If we are shutting down, let's discard the result. */
+	if (r->state == RAFT_UNAVAILABLE) {
+		tracef(
+		    "shutting down -> discard result of snapshot "
+		    "installation");
+		should_respond = false;
+		goto discard;
+	}
+	/* If the request is from a previous term, it means that someone else
+	 * became a candidate while we were installing the snapshot. In that
+	 * case, we want to install the snapshot anyway, but our "current
+	 * leader" may no longer be the same as the server that sent the
+	 * install request, so we shouldn't send a response to that server.
*/ + if (request->term != r->current_term) { + tracef( + "new term since receiving snapshot -> install but don't " + "respond"); + should_respond = false; + } + + if (status != 0) { + tracef("save snapshot %llu: %s", snapshot->index, + raft_strerror(status)); + goto discard; + } + + /* From Figure 5.3: + * + * 7. Discard the entire log + * 8. Reset state machine using snapshot contents (and load lastConfig + * as cluster configuration). + */ + rv = snapshotRestore(r, snapshot); + if (rv != 0) { + tracef("restore snapshot %llu: %s", snapshot->index, + raft_strerror(status)); + goto discard; + } + + tracef("restored snapshot with last index %llu", snapshot->index); + + goto respond; + +discard: + /* In case of error we must also free the snapshot data buffer and free + * the configuration. */ + result.rejected = snapshot->index; + raft_free(snapshot->bufs[0].base); + raft_free(snapshot->bufs); + raft_configuration_close(&snapshot->configuration); + +respond: + if (should_respond) { + result.last_log_index = r->last_stored; + sendAppendEntriesResult(r, &result); + } + + raft_free(request); +} + +int replicationInstallSnapshot(struct raft *r, + const struct raft_install_snapshot *args, + raft_index *rejected, + bool *async) +{ + struct recvInstallSnapshot *request; + struct raft_snapshot *snapshot; + raft_term local_term; + int rv; + + assert(r->state == RAFT_FOLLOWER); + + *rejected = args->last_index; + *async = false; + + /* If we are taking a snapshot ourselves or installing a snapshot, + * ignore the request, the leader will eventually retry. TODO: we should + * do something smarter. */ + if (r->snapshot.pending.term != 0 || r->snapshot.put.data != NULL) { + *async = true; + tracef("already taking or installing snapshot"); + return RAFT_BUSY; + } + + /* If our last snapshot is more up-to-date, this is a no-op */ + if (r->log->snapshot.last_index >= args->last_index) { + tracef("have more recent snapshot"); + *rejected = 0; + return 0; + } + + /* If we already have all entries in the snapshot, this is a no-op */ + local_term = logTermOf(r->log, args->last_index); + if (local_term != 0 && local_term >= args->last_term) { + tracef("have all entries"); + *rejected = 0; + return 0; + } + + *async = true; + + /* Preemptively update our in-memory state. */ + logRestore(r->log, args->last_index, args->last_term); + + r->last_stored = 0; + + request = raft_malloc(sizeof *request); + if (request == NULL) { + rv = RAFT_NOMEM; + goto err; + } + request->raft = r; + request->term = r->current_term; + + snapshot = &request->snapshot; + snapshot->term = args->last_term; + snapshot->index = args->last_index; + snapshot->configuration_index = args->conf_index; + snapshot->configuration = args->conf; + + snapshot->bufs = raft_malloc(sizeof *snapshot->bufs); + if (snapshot->bufs == NULL) { + rv = RAFT_NOMEM; + goto err_after_request_alloc; + } + snapshot->bufs[0] = args->data; + snapshot->n_bufs = 1; + + assert(r->snapshot.put.data == NULL); + r->snapshot.put.data = request; + rv = r->io->snapshot_put(r->io, + 0 /* zero trailing means replace everything */, + &r->snapshot.put, snapshot, installSnapshotCb); + if (rv != 0) { + tracef("snapshot_put failed %d", rv); + goto err_after_bufs_alloc; + } + + return 0; + +err_after_bufs_alloc: + raft_free(snapshot->bufs); + r->snapshot.put.data = NULL; +err_after_request_alloc: + raft_free(request); +err: + assert(rv != 0); + return rv; +} + +/* Apply a RAFT_COMMAND entry that has been committed. 
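+ * The entry's payload is handed to the user-supplied FSM's apply callback;
+ * if this server submitted the command via raft_apply(), the matching
+ * request's callback is then fired with the FSM's result.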
*/ +static int applyCommand(struct raft *r, + const raft_index index, + const struct raft_buffer *buf) +{ + struct raft_apply *req; + void *result; + int rv; + rv = r->fsm->apply(r->fsm, buf, &result); + if (rv != 0) { + return rv; + } + + r->last_applied = index; + + req = (struct raft_apply *)getRequest(r, index, RAFT_COMMAND); + if (req != NULL && req->cb != NULL) { + req->cb(req, 0, result); + } + return 0; +} + +/* Fire the callback of a barrier request whose entry has been committed. */ +static void applyBarrier(struct raft *r, const raft_index index) +{ + r->last_applied = index; + + struct raft_barrier *req; + req = (struct raft_barrier *)getRequest(r, index, RAFT_BARRIER); + if (req != NULL && req->cb != NULL) { + req->cb(req, 0); + } +} + +/* Apply a RAFT_CHANGE entry that has been committed. */ +static void applyChange(struct raft *r, const raft_index index) +{ + struct raft_change *req; + + assert(index > 0); + + /* If this is an uncommitted configuration that we had already applied + * when submitting the configuration change (for leaders) or upon + * receiving it via an AppendEntries RPC (for followers), then reset the + * uncommitted index, since that uncommitted configuration is now + * committed. */ + if (r->configuration_uncommitted_index == index) { + tracef("configuration at index:%llu is committed.", index); + r->configuration_uncommitted_index = 0; + } + + r->configuration_committed_index = index; + r->last_applied = index; + + if (r->state == RAFT_LEADER) { + const struct raft_server *server; + req = r->leader_state.change; + r->leader_state.change = NULL; + + /* If we are leader but not part of this new configuration, step + * down. + * + * From Section 4.2.2: + * + * In this approach, a leader that is removed from the + * configuration steps down once the Cnew entry is committed. + */ + server = configurationGet(&r->configuration, r->id); + if (server == NULL || server->role != RAFT_VOTER) { + tracef( + "leader removed from config or no longer voter " + "server: %p", + (void *)server); + convertToFollower(r); + } + + if (req != NULL && req->cb != NULL) { + req->cb(req, 0); + } + } +} + +static bool shouldTakeSnapshot(struct raft *r) +{ + /* If we are shutting down, let's not do anything. */ + if (r->state == RAFT_UNAVAILABLE) { + return false; + } + + /* If a snapshot is already in progress or we're installing a snapshot, + * we don't want to start another one. */ + if (r->snapshot.pending.term != 0 || r->snapshot.put.data != NULL) { + return false; + }; + + /* If we didn't reach the threshold yet, do nothing. */ + if (r->last_applied - r->log->snapshot.last_index < + r->snapshot.threshold) { + return false; + } + + return true; +} + +/* + * When taking a snapshot, ownership of the snapshot data is with raft if + * `snapshot_finalize` is NULL. + */ +static void takeSnapshotClose(struct raft *r, struct raft_snapshot *s) +{ + if (r->fsm->version == 1 || + (r->fsm->version > 1 && r->fsm->snapshot_finalize == NULL)) { + snapshotClose(s); + return; + } + + configurationClose(&s->configuration); + r->fsm->snapshot_finalize(r->fsm, &s->bufs, &s->n_bufs); +} + +static void takeSnapshotCb(struct raft_io_snapshot_put *req, int status) +{ + struct raft *r = req->data; + struct raft_snapshot *snapshot; + int rv; + + r->snapshot.put.data = NULL; + snapshot = &r->snapshot.pending; + + if (status != 0) { + tracef("snapshot %lld at term %lld: %s", snapshot->index, + snapshot->term, raft_strerror(status)); + goto out; + } + + /* Cache the configuration contained in the snapshot. 
+	 * While the snapshot was being written, new configuration changes
+	 * could have been committed; those changes will not be purged from
+	 * the log by this snapshot. However, we still cache the configuration
+	 * for consistency. */
+	configurationClose(&r->configuration_last_snapshot);
+	rv = configurationCopy(&snapshot->configuration,
+			       &r->configuration_last_snapshot);
+	if (rv != 0) {
+		/* TODO: make this a hard fault, because if we have no backup
+		 * and the log was truncated it will be impossible to rollback
+		 * an aborted configuration change. */
+		tracef("failed to backup last committed configuration.");
+	}
+	logSnapshot(r->log, snapshot->index, r->snapshot.trailing);
+out:
+	takeSnapshotClose(r, snapshot);
+	r->snapshot.pending.term = 0;
+}
+
+static int putSnapshot(struct raft *r,
+		       struct raft_snapshot *snapshot,
+		       raft_io_snapshot_put_cb cb)
+{
+	int rv;
+	assert(r->snapshot.put.data == NULL);
+	r->snapshot.put.data = r;
+	rv = r->io->snapshot_put(r->io, r->snapshot.trailing, &r->snapshot.put,
+				 snapshot, cb);
+	if (rv != 0) {
+		takeSnapshotClose(r, snapshot);
+		r->snapshot.pending.term = 0;
+		r->snapshot.put.data = NULL;
+	}
+
+	return rv;
+}
+
+static void takeSnapshotDoneCb(struct raft_io_async_work *take, int status)
+{
+	struct raft *r = take->data;
+	struct raft_snapshot *snapshot = &r->snapshot.pending;
+	int rv;
+
+	raft_free(take);
+
+	if (status != 0) {
+		tracef("take snapshot failed %s", raft_strerror(status));
+		takeSnapshotClose(r, snapshot);
+		r->snapshot.pending.term = 0;
+		r->snapshot.put.data = NULL;
+		return;
+	}
+
+	rv = putSnapshot(r, snapshot, takeSnapshotCb);
+	if (rv != 0) {
+		tracef("put snapshot failed %d", rv);
+	}
+}
+
+static int takeSnapshotAsync(struct raft_io_async_work *take)
+{
+	struct raft *r = take->data;
+	tracef("take snapshot async at %lld", r->snapshot.pending.index);
+	struct raft_snapshot *snapshot = &r->snapshot.pending;
+	return r->fsm->snapshot_async(r->fsm, &snapshot->bufs,
+				      &snapshot->n_bufs);
+}
+
+static int takeSnapshot(struct raft *r)
+{
+	struct raft_snapshot *snapshot;
+	int rv;
+
+	tracef("take snapshot at %lld", r->last_applied);
+
+	snapshot = &r->snapshot.pending;
+	snapshot->index = r->last_applied;
+	snapshot->term = logTermOf(r->log, r->last_applied);
+	snapshot->bufs = NULL;
+	snapshot->n_bufs = 0;
+
+	rv = membershipFetchLastCommittedConfiguration(
+	    r, &snapshot->configuration);
+	if (rv != 0) {
+		goto abort;
+	}
+	snapshot->configuration_index = r->configuration_committed_index;
+
+	rv = r->fsm->snapshot(r->fsm, &snapshot->bufs, &snapshot->n_bufs);
+	if (rv != 0) {
+		/* Ignore transient errors. We'll retry next time.
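+		 * Here RAFT_BUSY from fsm->snapshot() is treated as such a
+		 * transient error: it is cleared below, and
+		 * shouldTakeSnapshot() will trigger another attempt on a
+		 * later apply.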
*/ + if (rv == RAFT_BUSY) { + rv = 0; + } + raft_configuration_close(&snapshot->configuration); + goto abort; + } + + bool sync_snapshot = + r->fsm->version < 3 || r->fsm->snapshot_async == NULL; + if (sync_snapshot) { + /* putSnapshot will clean up config and buffers in case of error + */ + return putSnapshot(r, snapshot, takeSnapshotCb); + } else { + struct raft_io_async_work *take = raft_malloc(sizeof(*take)); + if (take == NULL) { + rv = RAFT_NOMEM; + goto abort_after_snapshot; + } + take->data = r; + take->work = takeSnapshotAsync; + rv = r->io->async_work(r->io, take, takeSnapshotDoneCb); + if (rv != 0) { + raft_free(take); + goto abort_after_snapshot; + } + } + + return 0; + +abort_after_snapshot: + /* Closes config and finalizes snapshot */ + takeSnapshotClose(r, snapshot); +abort: + r->snapshot.pending.term = 0; + return rv; +} + +int replicationApply(struct raft *r) +{ + raft_index index; + int rv = 0; + + assert(r->state == RAFT_LEADER || r->state == RAFT_FOLLOWER); + assert(r->last_applied <= r->commit_index); + + if (r->last_applied == r->commit_index) { + /* Nothing to do. */ + return 0; + } + + for (index = r->last_applied + 1; index <= r->commit_index; index++) { + const struct raft_entry *entry = logGet(r->log, index); + if (entry == NULL) { + /* This can happen while installing a snapshot */ + tracef("replicationApply - ENTRY NULL"); + return 0; + } + + assert(entry->type == RAFT_COMMAND || + entry->type == RAFT_BARRIER || + entry->type == RAFT_CHANGE); + + switch (entry->type) { + case RAFT_COMMAND: + rv = applyCommand(r, index, &entry->buf); + break; + case RAFT_BARRIER: + applyBarrier(r, index); + rv = 0; + break; + case RAFT_CHANGE: + applyChange(r, index); + rv = 0; + break; + default: + rv = 0; /* For coverity. This case can't be + taken. */ + break; + } + + if (rv != 0) { + break; + } + } + + if (shouldTakeSnapshot(r)) { + rv = takeSnapshot(r); + } + + return rv; +} + +void replicationQuorum(struct raft *r, const raft_index index) +{ + size_t votes = 0; + size_t i; + raft_term term; + + assert(r->state == RAFT_LEADER); + + if (index <= r->commit_index) { + return; + } + + term = logTermOf(r->log, index); + + /* TODO: fuzzy-test --seed 0x8db5fccc replication/entries/partitioned + * fails the assertion below. */ + if (term == 0) { + return; + } + // assert(logTermOf(r->log, index) > 0); + assert(!(term > r->current_term)); + + /* Don't commit entries from previous terms by counting replicas. */ + if (term < r->current_term) { + return; + } + + for (i = 0; i < r->configuration.n; i++) { + struct raft_server *server = &r->configuration.servers[i]; + if (server->role != RAFT_VOTER) { + continue; + } + if (r->leader_state.progress[i].match_index >= index) { + votes++; + } + } + + if (votes > configurationVoterCount(&r->configuration) / 2) { + r->commit_index = index; + tracef("new commit index %llu", r->commit_index); + } + + return; +} + +inline bool replicationInstallSnapshotBusy(struct raft *r) +{ + return r->last_stored == 0 && r->snapshot.put.data != NULL; +} + +#undef tracef diff --git a/src/raft/replication.h b/src/raft/replication.h new file mode 100644 index 000000000..5bfe07dbe --- /dev/null +++ b/src/raft/replication.h @@ -0,0 +1,98 @@ +/* Log replication logic and helpers. */ + +#ifndef REPLICATION_H_ +#define REPLICATION_H_ + +#include "../raft.h" + +/* Send AppendEntries RPC messages to all followers to which no AppendEntries + * was sent in the last heartbeat interval. 
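+ *
+ * In this patch it is driven by tickLeader() in src/raft/tick.c. A minimal
+ * driving loop (hypothetical sketch, not part of this change) would be:
+ *
+ *     while (running) {
+ *             sleep_ms(heartbeat_timeout); /* hypothetical helper */
+ *             if (raft_state(r) == RAFT_LEADER) {
+ *                     replicationHeartbeat(r);
+ *             }
+ *     }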
+ */
+int replicationHeartbeat(struct raft *r);
+
+/* Start a local disk write for entries from the given index onwards, and
+ * trigger replication against all followers, typically sending AppendEntries
+ * RPC messages with outstanding log entries. */
+int replicationTrigger(struct raft *r, raft_index index);
+
+/* Possibly send an AppendEntries or an InstallSnapshot RPC message to the
+ * server with the given index.
+ *
+ * The rules to decide whether or not to send a message are:
+ *
+ * - If we have sent an InstallSnapshot RPC recently and we haven't yet
+ *   received a response, then don't send any new message.
+ *
+ * - If we are probing the follower (i.e. we haven't received a successful
+ *   response during the last heartbeat interval), then send a message only
+ *   if we haven't sent any during the last heartbeat interval.
+ *
+ * - If we are pipelining entries to the follower, then send any new entries
+ *   that we haven't yet sent.
+ *
+ * If a message should be sent, the rules to decide what type of message to
+ * send and what it should contain are:
+ *
+ * - If we no longer have the first entry that should be sent to the
+ *   follower, then send an InstallSnapshot RPC with the last snapshot.
+ *
+ * - If we still have the first entry to send, then send all entries from
+ *   that index onward (possibly zero).
+ *
+ * This function must be called only by leaders. */
+int replicationProgress(struct raft *r, unsigned i);
+
+/* Update the replication state (match and next indexes) for the given server
+ * using the given AppendEntries RPC result.
+ *
+ * Possibly send to the server a new set of entries or a snapshot if the
+ * result was unsuccessful because of missing entries or if new entries were
+ * added to our log in the meantime.
+ *
+ * It must be called only by leaders. */
+int replicationUpdate(struct raft *r,
+		      const struct raft_server *server,
+		      const struct raft_append_entries_result *result);
+
+/* Append the log entries in the given request if the Log Matching Property is
+ * satisfied.
+ *
+ * The rejected output parameter will be set to 0 if the Log Matching Property
+ * was satisfied, or to args->prev_log_index if not.
+ *
+ * The async output parameter will be set to true if some of the entries in
+ * the request were not present in our log, and a disk write was started to
+ * persist them to disk. The entries will still be appended immediately to our
+ * in-memory copy of the log, but an AppendEntries result message will be sent
+ * only once the disk write completes and the I/O callback is invoked.
+ *
+ * It must be called only by followers. */
+int replicationAppend(struct raft *r,
+		      const struct raft_append_entries *args,
+		      raft_index *rejected,
+		      bool *async);
+
+int replicationInstallSnapshot(struct raft *r,
+			       const struct raft_install_snapshot *args,
+			       raft_index *rejected,
+			       bool *async);
+
+/* Returns `true` if the raft instance is currently installing a snapshot */
+bool replicationInstallSnapshotBusy(struct raft *r);
+
+/* Apply any committed entry that was not applied yet.
+ *
+ * It must be called by leaders or followers. */
+int replicationApply(struct raft *r);
+
+/* Check if a quorum has been reached for the given log index, and update the
+ * commit index accordingly if so.
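+ *
+ * For example (illustrative numbers): with five voters whose match indexes
+ * are 9, 9, 8, 7 and 7, index 8 is stored on three of the five, so the
+ * commit index can advance to 8, provided the entry at index 8 carries the
+ * current term.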
+ *
+ * From Figure 3.1:
+ *
+ *   [Rules for servers] Leaders:
+ *
+ *   If there exists an N such that N > commitIndex, a majority of
+ *   matchIndex[i] >= N, and log[N].term == currentTerm: set commitIndex = N */
+void replicationQuorum(struct raft *r, const raft_index index);
+
+#endif /* REPLICATION_H_ */
diff --git a/src/raft/request.h b/src/raft/request.h
new file mode 100644
index 000000000..08ad4a36b
--- /dev/null
+++ b/src/raft/request.h
@@ -0,0 +1,20 @@
+#ifndef REQUEST_H_
+#define REQUEST_H_
+
+#include "../raft.h"
+
+/* Abstract request type */
+struct request
+{
+	/* Must be kept in sync with RAFT__REQUEST in raft.h */
+	void *data;
+	int type;
+	raft_index index;
+	void *queue[2];
+	uint8_t req_id[16];
+	uint8_t client_id[16];
+	uint8_t unique_id[16];
+	uint64_t reserved[4];
+};
+
+#endif /* REQUEST_H_ */
diff --git a/src/raft/snapshot.c b/src/raft/snapshot.c
new file mode 100644
index 000000000..d05994fcb
--- /dev/null
+++ b/src/raft/snapshot.c
@@ -0,0 +1,114 @@
+#include "snapshot.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "../tracing.h"
+#include "assert.h"
+#include "configuration.h"
+#include "err.h"
+#include "log.h"
+
+void snapshotClose(struct raft_snapshot *s)
+{
+	unsigned i;
+	configurationClose(&s->configuration);
+	for (i = 0; i < s->n_bufs; i++) {
+		raft_free(s->bufs[i].base);
+	}
+	raft_free(s->bufs);
+}
+
+void snapshotDestroy(struct raft_snapshot *s)
+{
+	snapshotClose(s);
+	raft_free(s);
+}
+
+int snapshotRestore(struct raft *r, struct raft_snapshot *snapshot)
+{
+	int rv;
+
+	assert(snapshot->n_bufs == 1);
+
+	rv = r->fsm->restore(r->fsm, &snapshot->bufs[0]);
+	if (rv != 0) {
+		tracef("restore snapshot %llu: %s", snapshot->index,
+		       errCodeToString(rv));
+		return rv;
+	}
+
+	configurationClose(&r->configuration);
+	r->configuration = snapshot->configuration;
+	r->configuration_committed_index = snapshot->configuration_index;
+	r->configuration_uncommitted_index = 0;
+
+	/* Make a copy of the configuration contained in the snapshot, in case
+	 * r->configuration gets overridden with an uncommitted configuration
+	 * and we then need to roll back, but the log no longer contains the
+	 * entry at r->configuration_committed_index because it was truncated.
+	 */
+	configurationClose(&r->configuration_last_snapshot);
+	rv = configurationCopy(&r->configuration,
+			       &r->configuration_last_snapshot);
+	if (rv != 0) {
+		return rv;
+	}
+
+	configurationTrace(r, &r->configuration,
+			   "configuration restore from snapshot");
+
+	r->commit_index = snapshot->index;
+	r->last_applied = snapshot->index;
+	r->last_stored = snapshot->index;
+
+	/* Don't free the snapshot data buffer, as ownership has been
+	 * transferred to the fsm.
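+	 * (fsm->restore() above took bufs[0].base; only the array of buffer
+	 * descriptors is released below.)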
+	 */
+	raft_free(snapshot->bufs);
+
+	return 0;
+}
+
+int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst)
+{
+	int rv;
+	unsigned i;
+	size_t size;
+	uint8_t *cursor;
+
+	dst->term = src->term;
+	dst->index = src->index;
+	dst->configuration_index = src->configuration_index;
+
+	rv = configurationCopy(&src->configuration, &dst->configuration);
+	if (rv != 0) {
+		return rv;
+	}
+
+	size = 0;
+	for (i = 0; i < src->n_bufs; i++) {
+		size += src->bufs[i].len;
+	}
+
+	dst->bufs = raft_malloc(sizeof *dst->bufs);
+	if (dst->bufs == NULL) {
+		return RAFT_NOMEM;
+	}
+
+	dst->bufs[0].base = raft_malloc(size);
+	if (dst->bufs[0].base == NULL) {
+		return RAFT_NOMEM;
+	}
+	dst->bufs[0].len = size;
+
+	cursor = dst->bufs[0].base;
+
+	for (i = 0; i < src->n_bufs; i++) {
+		memcpy(cursor, src->bufs[i].base, src->bufs[i].len);
+		cursor += src->bufs[i].len;
+	}
+
+	dst->n_bufs = 1;
+
+	return 0;
+}
+
+#undef tracef
diff --git a/src/raft/snapshot.h b/src/raft/snapshot.h
new file mode 100644
index 000000000..90ab1b337
--- /dev/null
+++ b/src/raft/snapshot.h
@@ -0,0 +1,28 @@
+#ifndef RAFT_SNAPSHOT_H_
+#define RAFT_SNAPSHOT_H_
+
+#include "../raft.h"
+
+/* Release all memory associated with the given snapshot. */
+void snapshotClose(struct raft_snapshot *s);
+
+/* Like snapshotClose(), but also release the snapshot object itself. */
+void snapshotDestroy(struct raft_snapshot *s);
+
+/* Restore a snapshot.
+ *
+ * This will reset the current state of the server as if the last entry
+ * contained in the snapshot had just been persisted, committed and applied.
+ *
+ * The in-memory log must be empty when calling this function.
+ *
+ * If no error occurs, the memory of the snapshot object gets released. */
+int snapshotRestore(struct raft *r, struct raft_snapshot *snapshot);
+
+/* Make a full deep copy of a snapshot object.
+ *
+ * All data buffers in the source snapshot will be compacted in a single
+ * buffer in the destination snapshot. */
+int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst);
+
+#endif /* RAFT_SNAPSHOT_H_ */
diff --git a/src/raft/start.c b/src/raft/start.c
new file mode 100644
index 000000000..023d51e74
--- /dev/null
+++ b/src/raft/start.c
@@ -0,0 +1,232 @@
+#include "../raft.h"
+#include "../tracing.h"
+#include "assert.h"
+#include "configuration.h"
+#include "convert.h"
+#include "entry.h"
+#include "err.h"
+#include "log.h"
+#include "recv.h"
+#include "snapshot.h"
+#include "tick.h"
+
+/* Restore the most recent configuration entry found in the log. */
+static int restoreMostRecentConfigurationEntry(struct raft *r,
+					       struct raft_entry *entry,
+					       raft_index index)
+{
+	struct raft_configuration configuration;
+	int rv;
+
+	rv = configurationDecode(&entry->buf, &configuration);
+	if (rv != 0) {
+		configurationClose(&configuration);
+		return rv;
+	}
+
+	configurationClose(&r->configuration);
+	r->configuration = configuration;
+
+	/* If the configuration comes from the entry at index 1 in the log, we
+	 * know it's the bootstrap configuration and it's committed by
+	 * default. Otherwise we can't know if it's committed or not and treat
+	 * it as uncommitted. */
+	if (index == 1) {
+		assert(r->configuration_uncommitted_index == 0);
+		r->configuration_committed_index = 1;
+	} else {
+		assert(r->configuration_committed_index < index);
+		r->configuration_uncommitted_index = index;
+	}
+
+	configurationTrace(r, &r->configuration,
+			   "restore most recent configuration");
+	return 0;
+}
+
+/* Restore the entries that were loaded from persistent storage.
+ * The most recent configuration entry will be restored as well, if any.
+ *
+ * Note that if the last configuration entry in the log has index greater
+ * than one we cannot know if it is committed or not. Therefore we also need
+ * to track the second-to-last configuration entry. This second-to-last entry
+ * is committed by default, as raft doesn't allow multiple uncommitted
+ * configuration entries. That entry is used in case of configuration
+ * rollback scenarios. If we don't find the second-to-last configuration
+ * entry in the log, it means that the log was truncated after a snapshot and
+ * the second-to-last configuration is available in
+ * r->configuration_last_snapshot, which we populated earlier when the
+ * snapshot was restored. */
+static int restoreEntries(struct raft *r,
+			  raft_index snapshot_index,
+			  raft_term snapshot_term,
+			  raft_index start_index,
+			  struct raft_entry *entries,
+			  size_t n)
+{
+	struct raft_entry *conf = NULL;
+	raft_index conf_index = 0;
+	size_t i;
+	int rv;
+	logStart(r->log, snapshot_index, snapshot_term, start_index);
+	r->last_stored = start_index - 1;
+	for (i = 0; i < n; i++) {
+		struct raft_entry *entry = &entries[i];
+		rv = logAppend(r->log, entry->term, entry->type, &entry->buf,
+			       entry->batch);
+		if (rv != 0) {
+			goto err;
+		}
+		r->last_stored++;
+
+		/* Only take into account configurations that are newer than
+		 * the configuration restored from the snapshot. */
+		if (entry->type == RAFT_CHANGE &&
+		    r->last_stored > r->configuration_committed_index) {
+			/* If there is a previous configuration it must have
+			 * been committed as we don't allow multiple
+			 * uncommitted configurations. At the end of the loop
+			 * r->configuration_committed_index will point to the
+			 * second to last configuration entry, if any. */
+			if (conf_index != 0) {
+				r->configuration_committed_index = conf_index;
+			}
+			conf = entry;
+			conf_index = r->last_stored;
+		}
+	}
+
+	if (conf != NULL) {
+		rv = restoreMostRecentConfigurationEntry(r, conf, conf_index);
+		if (rv != 0) {
+			goto err;
+		}
+	}
+
+	raft_free(entries);
+	return 0;
+
+err:
+	if (logNumEntries(r->log) > 0) {
+		logDiscard(r->log, r->log->offset + 1);
+	}
+	return rv;
+}
+
+/* If we're the only voting server in the configuration, automatically
+ * self-elect ourselves and convert to leader without waiting for the election
+ * timeout. */
+static int maybeSelfElect(struct raft *r)
+{
+	const struct raft_server *server;
+	int rv;
+	server = configurationGet(&r->configuration, r->id);
+	if (server == NULL || server->role != RAFT_VOTER ||
+	    configurationVoterCount(&r->configuration) > 1) {
+		return 0;
+	}
+	/* Converting to candidate will notice that we're the only voter and
+	 * automatically convert to leader.
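+	 *
+	 * This is what makes a fresh single-node cluster usable right away.
+	 * A hypothetical bootstrap sequence such as:
+	 *
+	 *     raft_configuration_init(&conf);
+	 *     raft_configuration_add(&conf, 1, "127.0.0.1:9001", RAFT_VOTER);
+	 *     raft_bootstrap(&r, &conf);
+	 *     raft_start(&r);
+	 *
+	 * leaves the server in RAFT_LEADER state without waiting for an
+	 * election timeout.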
*/ + rv = convertToCandidate(r, false /* disrupt leader */); + if (rv != 0) { + return rv; + } + assert(r->state == RAFT_LEADER); + return 0; +} + +int raft_start(struct raft *r) +{ + struct raft_snapshot *snapshot; + raft_index snapshot_index = 0; + raft_term snapshot_term = 0; + raft_index start_index; + struct raft_entry *entries; + size_t n_entries; + int rv; + + assert(r != NULL); + assert(r->state == RAFT_UNAVAILABLE); + assert(r->heartbeat_timeout != 0); + assert(r->heartbeat_timeout < r->election_timeout); + assert(r->install_snapshot_timeout != 0); + assert(logNumEntries(r->log) == 0); + assert(logSnapshotIndex(r->log) == 0); + assert(r->last_stored == 0); + +#ifndef RAFT_REVISION +#define RAFT_REVISION "unknown" +#endif + tracef("starting version:%d revision:%s", raft_version_number(), + RAFT_REVISION); + rv = r->io->load(r->io, &r->current_term, &r->voted_for, &snapshot, + &start_index, &entries, &n_entries); + if (rv != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); + return rv; + } + assert(start_index >= 1); + tracef( + "current_term:%llu voted_for:%llu start_index:%llu n_entries:%zu", + r->current_term, r->voted_for, start_index, n_entries); + + /* If we have a snapshot, let's restore it. */ + if (snapshot != NULL) { + tracef( + "restore snapshot with last index %llu and last term %llu", + snapshot->index, snapshot->term); + rv = snapshotRestore(r, snapshot); + if (rv != 0) { + snapshotDestroy(snapshot); + entryBatchesDestroy(entries, n_entries); + return rv; + } + snapshot_index = snapshot->index; + snapshot_term = snapshot->term; + raft_free(snapshot); + } else if (n_entries > 0) { + /* If we don't have a snapshot and the on-disk log is not empty, + * then the first entry must be a configuration entry. */ + assert(start_index == 1); + assert(entries[0].type == RAFT_CHANGE); + + /* As a small optimization, bump the commit index to 1 since we + * require the first entry to be the same on all servers. */ + r->commit_index = 1; + r->last_applied = 1; + } + + /* Append the entries to the log, possibly restoring the last + * configuration. */ + tracef("restore %zu entries starting at %llu", n_entries, start_index); + rv = restoreEntries(r, snapshot_index, snapshot_term, start_index, + entries, n_entries); + if (rv != 0) { + entryBatchesDestroy(entries, n_entries); + return rv; + } + + /* Start the I/O backend. The tickCb function is expected to fire every + * r->heartbeat_timeout milliseconds and recvCb whenever an RPC is + * received. */ + rv = r->io->start(r->io, r->heartbeat_timeout, tickCb, recvCb); + if (rv != 0) { + tracef("io start failed %d", rv); + return rv; + } + + /* By default we start as followers. */ + convertToFollower(r); + + /* If there's only one voting server, and that is us, it's safe to + * convert to leader right away. If that is not us, we're either joining + * the cluster or we're simply configured as non-voter, and we'll stay + * follower. 
+	 */
+	rv = maybeSelfElect(r);
+	if (rv != 0) {
+		return rv;
+	}
+
+	return 0;
+}
+
+#undef tracef
diff --git a/src/raft/state.c b/src/raft/state.c
new file mode 100644
index 000000000..af46d76d9
--- /dev/null
+++ b/src/raft/state.c
@@ -0,0 +1,54 @@
+#include "assert.h"
+#include "configuration.h"
+#include "election.h"
+#include "log.h"
+#include "queue.h"
+
+int raft_state(struct raft *r)
+{
+	return r->state;
+}
+
+void raft_leader(struct raft *r, raft_id *id, const char **address)
+{
+	switch (r->state) {
+		case RAFT_UNAVAILABLE:
+		case RAFT_CANDIDATE:
+			*id = 0;
+			*address = NULL;
+			return;
+		case RAFT_FOLLOWER:
+			*id = r->follower_state.current_leader.id;
+			*address = r->follower_state.current_leader.address;
+			return;
+		case RAFT_LEADER:
+			if (r->transfer != NULL) {
+				*id = 0;
+				*address = NULL;
+				return;
+			}
+			*id = r->id;
+			*address = r->address;
+			return;
+	}
+}
+
+raft_index raft_last_index(struct raft *r)
+{
+	return logLastIndex(r->log);
+}
+
+raft_index raft_last_applied(struct raft *r)
+{
+	return r->last_applied;
+}
+
+int raft_role(struct raft *r)
+{
+	const struct raft_server *local =
+	    configurationGet(&r->configuration, r->id);
+	if (local == NULL) {
+		return -1;
+	}
+	return local->role;
+}
diff --git a/src/raft/syscall.c b/src/raft/syscall.c
new file mode 100644
index 000000000..12c4390a0
--- /dev/null
+++ b/src/raft/syscall.c
@@ -0,0 +1,58 @@
+#include "syscall.h"
+
+#if HAVE_LINUX_AIO_ABI_H || HAVE_LINUX_IO_URING_H
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#if HAVE_LINUX_AIO_ABI_H
+int io_setup(unsigned nr_events, aio_context_t *ctx_idp)
+{
+	return (int)syscall(__NR_io_setup, nr_events, ctx_idp);
+}
+
+int io_destroy(aio_context_t ctx_id)
+{
+	return (int)syscall(__NR_io_destroy, ctx_id);
+}
+
+int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+	return (int)syscall(__NR_io_submit, ctx_id, nr, iocbpp);
+}
+
+int io_getevents(aio_context_t ctx_id,
+		 long min_nr,
+		 long nr,
+		 struct io_event *events,
+		 struct timespec *timeout)
+{
+	return (int)syscall(__NR_io_getevents, ctx_id, min_nr, nr, events,
+			    timeout);
+}
+#endif
+
+#if HAVE_LINUX_IO_URING_H
+int io_uring_register(int fd,
+		      unsigned int opcode,
+		      const void *arg,
+		      unsigned int nr_args)
+{
+	return (int)syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
+}
+
+int io_uring_setup(unsigned int entries, struct io_uring_params *p)
+{
+	return (int)syscall(__NR_io_uring_setup, entries, p);
+}
+
+int io_uring_enter(int fd,
+		   unsigned int to_submit,
+		   unsigned int min_complete,
+		   unsigned int flags,
+		   sigset_t *sig)
+{
+	return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
+			    flags, sig, _NSIG / 8);
+}
+#endif
diff --git a/src/raft/syscall.h b/src/raft/syscall.h
new file mode 100644
index 000000000..c8459fffc
--- /dev/null
+++ b/src/raft/syscall.h
@@ -0,0 +1,47 @@
+/* Wrappers for system calls not yet defined in libc.
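+ *
+ * At the time of writing, glibc does not ship wrappers for the AIO and
+ * io_uring system calls, so they are invoked directly through syscall(2)
+ * with their __NR_* numbers.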
+ */
+
+#ifndef SYSCALL_H_
+#define SYSCALL_H_
+
+#if HAVE_LINUX_AIO_ABI_H
+#include <linux/aio_abi.h>
+#include <signal.h>
+#include <time.h>
+#endif
+
+#if HAVE_LINUX_IO_URING_H
+#include <linux/io_uring.h>
+#endif
+
+#if HAVE_LINUX_AIO_ABI_H
+/* AIO */
+int io_setup(unsigned nr_events, aio_context_t *ctx_idp);
+
+int io_destroy(aio_context_t ctx_id);
+
+int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp);
+
+int io_getevents(aio_context_t ctx_id,
+		 long min_nr,
+		 long nr,
+		 struct io_event *events,
+		 struct timespec *timeout);
+#endif
+
+#if HAVE_LINUX_IO_URING_H
+/* uring */
+int io_uring_register(int fd,
+		      unsigned int opcode,
+		      const void *arg,
+		      unsigned int nr_args);
+
+int io_uring_setup(unsigned int entries, struct io_uring_params *p);
+
+int io_uring_enter(int fd,
+		   unsigned int to_submit,
+		   unsigned int min_complete,
+		   unsigned int flags,
+		   sigset_t *sig);
+#endif
+
+#endif /* SYSCALL_H_ */
diff --git a/src/raft/tick.c b/src/raft/tick.c
new file mode 100644
index 000000000..f6dd407c7
--- /dev/null
+++ b/src/raft/tick.c
@@ -0,0 +1,259 @@
+#include "../raft.h"
+#include "../tracing.h"
+#include "assert.h"
+#include "configuration.h"
+#include "convert.h"
+#include "election.h"
+#include "membership.h"
+#include "progress.h"
+#include "replication.h"
+
+/* Apply time-dependent rules for followers (Figure 3.1). */
+static int tickFollower(struct raft *r)
+{
+	const struct raft_server *server;
+	int rv;
+
+	assert(r != NULL);
+	assert(r->state == RAFT_FOLLOWER);
+
+	server = configurationGet(&r->configuration, r->id);
+
+	/* If we have been removed from the configuration, or maybe we didn't
+	 * receive one yet, just stay follower. */
+	if (server == NULL) {
+		return 0;
+	}
+
+	/* Check if we need to start an election.
+	 *
+	 * From Section 3.3:
+	 *
+	 *   If a follower receives no communication over a period of time
+	 *   called the election timeout, then it assumes there is no viable
+	 *   leader and begins an election to choose a new leader.
+	 *
+	 * Figure 3.1:
+	 *
+	 *   If election timeout elapses without receiving AppendEntries RPC
+	 *   from current leader or granting vote to candidate, convert to
+	 *   candidate.
+	 */
+	if (electionTimerExpired(r) && server->role == RAFT_VOTER) {
+		if (replicationInstallSnapshotBusy(r)) {
+			tracef(
+			    "installing snapshot -> don't convert to "
+			    "candidate");
+			electionResetTimer(r);
+			return 0;
+		}
+		if (r->follower_state.append_in_flight_count > 0) {
+			tracef(
+			    "append in progress -> don't convert to candidate");
+			electionResetTimer(r);
+			return 0;
+		}
+		tracef("convert to candidate and start new election");
+		rv = convertToCandidate(r, false /* disrupt leader */);
+		if (rv != 0) {
+			tracef("convert to candidate: %s", raft_strerror(rv));
+			return rv;
+		}
+	}
+
+	return 0;
+}
+
+/* Apply time-dependent rules for candidates (Figure 3.1). */
+static int tickCandidate(struct raft *r)
+{
+	assert(r != NULL);
+	assert(r->state == RAFT_CANDIDATE);
+
+	/* Check if we need to start an election.
+	 *
+	 * From Section 3.4:
+	 *
+	 *   The third possible outcome is that a candidate neither wins nor
+	 *   loses the election: if many followers become candidates at the
+	 *   same time, votes could be split so that no candidate obtains a
+	 *   majority.
+	 *   When this happens, each candidate will time out and start a new
+	 *   election by incrementing its term and initiating another round of
+	 *   RequestVote RPCs.
+	 */
+	if (electionTimerExpired(r)) {
+		tracef("start new election");
+		return electionStart(r);
+	}
+
+	return 0;
+}
+
+/* Return true if we received an AppendEntries RPC result from a majority of
+ * voting servers since we became leader or since the last time this function
+ * was called.
+ *
+ * For each server the function checks the recent_recv flag of the associated
+ * progress object, and resets the flag after the check. It returns true if a
+ * majority of voting servers had the flag set to true. */
+static bool checkContactQuorum(struct raft *r)
+{
+	unsigned i;
+	unsigned contacts = 0;
+	assert(r->state == RAFT_LEADER);
+
+	for (i = 0; i < r->configuration.n; i++) {
+		struct raft_server *server = &r->configuration.servers[i];
+		bool recent_recv = progressResetRecentRecv(r, i);
+		if ((server->role == RAFT_VOTER && recent_recv) ||
+		    server->id == r->id) {
+			contacts++;
+		}
+	}
+	r->leader_state.voter_contacts = contacts;
+
+	return contacts > configurationVoterCount(&r->configuration) / 2;
+}
+
+/* Apply time-dependent rules for leaders (Figure 3.1). */
+static int tickLeader(struct raft *r)
+{
+	raft_time now = r->io->time(r->io);
+	assert(r->state == RAFT_LEADER);
+
+	/* Check if we still can reach a majority of servers.
+	 *
+	 * From Section 6.2:
+	 *
+	 *   A leader in Raft steps down if an election timeout elapses
+	 *   without a successful round of heartbeats to a majority of its
+	 *   cluster; this allows clients to retry their requests with another
+	 *   server.
+	 */
+	if (now - r->election_timer_start >= r->election_timeout) {
+		if (!checkContactQuorum(r)) {
+			tracef(
+			    "unable to contact majority of cluster -> step "
+			    "down");
+			convertToFollower(r);
+			return 0;
+		}
+		r->election_timer_start = r->io->time(r->io);
+	}
+
+	/* Possibly send heartbeats.
+	 *
+	 * From Figure 3.1:
+	 *
+	 *   Send empty AppendEntries RPC during idle periods to prevent
+	 *   election timeouts.
+	 */
+	replicationHeartbeat(r);
+
+	/* If a server is being promoted, increment the timer of the current
+	 * round or abort the promotion.
+	 *
+	 * From Section 4.2.1:
+	 *
+	 *   The algorithm waits a fixed number of rounds (such as 10). If the
+	 *   last round lasts less than an election timeout, then the leader
+	 *   adds the new server to the cluster, under the assumption that
+	 *   there are not enough unreplicated entries to create a significant
+	 *   availability gap. Otherwise, the leader aborts the configuration
+	 *   change with an error.
+	 */
+	if (r->leader_state.promotee_id != 0) {
+		raft_id id = r->leader_state.promotee_id;
+		unsigned server_index;
+		raft_time round_duration = now - r->leader_state.round_start;
+		bool is_too_slow;
+		bool is_unresponsive;
+
+		/* If a promotion is in progress, we expect that our
+		 * configuration contains an entry for the server being
+		 * promoted, and that the server is not yet considered as
+		 * voting. */
+		server_index = configurationIndexOf(&r->configuration, id);
+		assert(server_index < r->configuration.n);
+		assert(r->configuration.servers[server_index].role !=
+		       RAFT_VOTER);
+
+		is_too_slow =
+		    (r->leader_state.round_number == r->max_catch_up_rounds &&
+		     round_duration > r->election_timeout);
+		is_unresponsive =
+		    round_duration > r->max_catch_up_round_duration;
+
+		/* Abort the promotion if we are at the 10th round and it's
+		 * still taking too long, or if the server is unresponsive.
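+		 *
+		 * For example (illustrative numbers): with
+		 * max_catch_up_rounds = 10 and a one-second election timeout,
+		 * the promotion is aborted if the tenth round alone takes
+		 * longer than one second, or if any round exceeds
+		 * max_catch_up_round_duration.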
*/
+		if (is_too_slow || is_unresponsive) {
+			tracef(
+			    "server_index:%d is_too_slow:%d is_unresponsive:%d",
+			    server_index, is_too_slow, is_unresponsive);
+			struct raft_change *change;
+
+			r->leader_state.promotee_id = 0;
+
+			r->leader_state.round_index = 0;
+			r->leader_state.round_number = 0;
+			r->leader_state.round_start = 0;
+
+			change = r->leader_state.change;
+			r->leader_state.change = NULL;
+			if (change != NULL && change->cb != NULL) {
+				change->cb(change, RAFT_NOCONNECTION);
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int tick(struct raft *r)
+{
+	int rv = -1;
+
+	assert(r->state == RAFT_UNAVAILABLE || r->state == RAFT_FOLLOWER ||
+	       r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER);
+
+	/* If we are not available, let's do nothing. */
+	if (r->state == RAFT_UNAVAILABLE) {
+		return 0;
+	}
+
+	switch (r->state) {
+		case RAFT_FOLLOWER:
+			rv = tickFollower(r);
+			break;
+		case RAFT_CANDIDATE:
+			rv = tickCandidate(r);
+			break;
+		case RAFT_LEADER:
+			rv = tickLeader(r);
+			break;
+	}
+
+	return rv;
+}
+
+void tickCb(struct raft_io *io)
+{
+	struct raft *r;
+	int rv;
+	r = io->data;
+	rv = tick(r);
+	if (rv != 0) {
+		convertToUnavailable(r);
+		return;
+	}
+
+	/* For all states: if there is a leadership transfer request in
+	 * progress, check if it's expired. */
+	if (r->transfer != NULL) {
+		raft_time now = r->io->time(r->io);
+		if (now - r->transfer->start >= r->election_timeout) {
+			membershipLeadershipTransferClose(r);
+		}
+	}
+}
+
+#undef tracef
diff --git a/src/raft/tick.h b/src/raft/tick.h
new file mode 100644
index 000000000..ad8751aee
--- /dev/null
+++ b/src/raft/tick.h
@@ -0,0 +1,12 @@
+/* Logic to be invoked periodically. */
+
+#ifndef TICK_H_
+#define TICK_H_
+
+#include "../raft.h"
+
+/* Callback to be passed to the @raft_io implementation. It notifies us that a
+ * certain amount of time has elapsed and will be invoked periodically. */
+void tickCb(struct raft_io *io);
+
+#endif /* TICK_H_ */
diff --git a/src/raft/utils.h b/src/raft/utils.h
new file mode 100644
index 000000000..d01688c87
--- /dev/null
+++ b/src/raft/utils.h
@@ -0,0 +1,17 @@
+#ifndef RAFT_UTILS_H_
+#define RAFT_UTILS_H_
+
+#include <stdio.h>
+
+/* Various utility functions and macros */
+
+#define LIKELY(x) __builtin_expect(!!(x), 1)
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+
+#define DBG() fprintf(stderr, "%s:%d\n", __func__, __LINE__)
+
+#define UNUSED __attribute__((unused))
+
+#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof(a)[0]))
+
+#endif /* RAFT_UTILS_H_ */
diff --git a/src/raft/uv.c b/src/raft/uv.c
new file mode 100644
index 000000000..c0602c5d3
--- /dev/null
+++ b/src/raft/uv.c
@@ -0,0 +1,815 @@
+#include "../raft.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/random.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../raft.h"
+#include "../tracing.h"
+#include "assert.h"
+#include "byte.h"
+#include "configuration.h"
+#include "entry.h"
+#include "heap.h"
+#include "snapshot.h"
+#include "uv.h"
+#include "uv_encoding.h"
+#include "uv_os.h"
+
+/* Retry to connect to peer servers every second.
+ *
+ * TODO: implement an exponential backoff instead.
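+ *
+ * A minimal sketch of what such a backoff could look like (illustrative
+ * pseudo-code only; MAX_CONNECT_RETRY_DELAY would be a new upper-bound
+ * constant, not something defined here):
+ *
+ *   delay = CONNECT_RETRY_DELAY;
+ *   on each failed connect attempt:
+ *       delay = MIN(delay * 2, MAX_CONNECT_RETRY_DELAY);
+ *   on a successful connect:
+ *       delay = CONNECT_RETRY_DELAY;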
*/ +#define CONNECT_RETRY_DELAY 1000 + +/* Cleans up files that are no longer used by the system */ +static int uvMaintenance(const char *dir, char *errmsg) +{ + struct uv_fs_s req; + struct uv_dirent_s entry; + int n; + int i; + int rv; + int rv2; + + n = uv_fs_scandir(NULL, &req, dir, 0, NULL); + if (n < 0) { + ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n)); + return RAFT_IOERR; + } + + rv = 0; + for (i = 0; i < n; i++) { + const char *filename; + rv = uv_fs_scandir_next(&req, &entry); + assert(rv == 0); /* Can't fail in libuv */ + + filename = entry.name; + /* Remove leftover tmp-files */ + if (strncmp(filename, TMP_FILE_PREFIX, + strlen(TMP_FILE_PREFIX)) == 0) { + UvFsRemoveFile(dir, filename, + errmsg); /* Ignore errors */ + continue; + } + + /* Remove orphaned snapshot files */ + bool orphan = false; + if ((UvSnapshotIsOrphan(dir, filename, &orphan) == 0) && + orphan) { + UvFsRemoveFile(dir, filename, + errmsg); /* Ignore errors */ + continue; + } + + /* Remove orphaned snapshot metadata files */ + if ((UvSnapshotMetaIsOrphan(dir, filename, &orphan) == 0) && + orphan) { + UvFsRemoveFile(dir, filename, + errmsg); /* Ignore errors */ + } + } + + rv2 = uv_fs_scandir_next(&req, &entry); + assert(rv2 == UV_EOF); + return rv; +} + +/* Implementation of raft_io->config. */ +static int uvInit(struct raft_io *io, raft_id id, const char *address) +{ + struct uv *uv; + size_t direct_io; + struct uvMetadata metadata; + int rv; + uv = io->impl; + uv->id = id; + + rv = UvFsCheckDir(uv->dir, io->errmsg); + if (rv != 0) { + return rv; + } + + /* Probe file system capabilities */ + rv = UvFsProbeCapabilities(uv->dir, &direct_io, &uv->async_io, + &uv->fallocate, io->errmsg); + if (rv != 0) { + return rv; + } + uv->direct_io = direct_io != 0; + uv->block_size = direct_io != 0 ? direct_io : 4096; + + rv = uvMaintenance(uv->dir, io->errmsg); + if (rv != 0) { + return rv; + } + + rv = uvMetadataLoad(uv->dir, &metadata, io->errmsg); + if (rv != 0) { + return rv; + } + uv->metadata = metadata; + + rv = uv->transport->init(uv->transport, id, address); + if (rv != 0) { + ErrMsgTransfer(uv->transport->errmsg, io->errmsg, "transport"); + return rv; + } + uv->transport->data = uv; + + rv = uv_timer_init(uv->loop, &uv->timer); + assert(rv == 0); /* This should never fail */ + uv->timer.data = uv; + + return 0; +} + +/* Periodic timer callback */ +static void uvTickTimerCb(uv_timer_t *timer) +{ + struct uv *uv; + uv = timer->data; + if (uv->tick_cb != NULL) { + uv->tick_cb(uv->io); + } +} + +/* Implementation of raft_io->start. 
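+ *
+ * This starts the receive stream and a repeating timer that invokes
+ * tick_cb every msecs milliseconds. As a sketch (hypothetical caller
+ * values), a user of the interface would do:
+ *
+ *   rv = io->start(io, 100, tickCb, recvCb);
+ *
+ * to get a tick roughly every 100 ms until raft_io->close is called.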
*/
+static int uvStart(struct raft_io *io,
+		   unsigned msecs,
+		   raft_io_tick_cb tick_cb,
+		   raft_io_recv_cb recv_cb)
+{
+	struct uv *uv;
+	int rv;
+	uv = io->impl;
+	uv->state = UV__ACTIVE;
+	uv->tick_cb = tick_cb;
+	uv->recv_cb = recv_cb;
+	rv = UvRecvStart(uv);
+	if (rv != 0) {
+		return rv;
+	}
+	rv = uv_timer_start(&uv->timer, uvTickTimerCb, msecs, msecs);
+	assert(rv == 0);
+	return 0;
+}
+
+void uvMaybeFireCloseCb(struct uv *uv)
+{
+	tracef("uv maybe fire close cb");
+	if (!uv->closing) {
+		return;
+	}
+
+	if (uv->transport->data != NULL) {
+		return;
+	}
+	if (uv->timer.data != NULL) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->append_segments)) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->finalize_reqs)) {
+		return;
+	}
+	if (uv->finalize_work.data != NULL) {
+		return;
+	}
+	if (uv->prepare_inflight != NULL) {
+		return;
+	}
+	if (uv->barrier != NULL) {
+		return;
+	}
+	if (uv->snapshot_put_work.data != NULL) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->snapshot_get_reqs)) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->async_work_reqs)) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->aborting)) {
+		return;
+	}
+
+	assert(uv->truncate_work.data == NULL);
+
+	if (uv->close_cb != NULL) {
+		uv->close_cb(uv->io);
+	}
+}
+
+static void uvTickTimerCloseCb(uv_handle_t *handle)
+{
+	struct uv *uv = handle->data;
+	assert(uv->closing);
+	uv->timer.data = NULL;
+	uvMaybeFireCloseCb(uv);
+}
+
+static void uvTransportCloseCb(struct raft_uv_transport *transport)
+{
+	struct uv *uv = transport->data;
+	assert(uv->closing);
+	uv->transport->data = NULL;
+	uvMaybeFireCloseCb(uv);
+}
+
+/* Implementation of raft_io->close. */
+static void uvClose(struct raft_io *io, raft_io_close_cb cb)
+{
+	struct uv *uv;
+	uv = io->impl;
+	assert(uv != NULL);
+	assert(!uv->closing);
+	uv->close_cb = cb;
+	uv->closing = true;
+	UvSendClose(uv);
+	UvRecvClose(uv);
+	uvAppendClose(uv);
+	if (uv->transport->data != NULL) {
+		uv->transport->close(uv->transport, uvTransportCloseCb);
+	}
+	if (uv->timer.data != NULL) {
+		uv_close((uv_handle_t *)&uv->timer, uvTickTimerCloseCb);
+	}
+	uvMaybeFireCloseCb(uv);
+}
+
+/* Filter the given segment list to find the most recent contiguous chunk of
+ * closed segments that overlaps with the given snapshot last index. */
+static int uvFilterSegments(struct uv *uv,
+			    raft_index last_index,
+			    const char *snapshot_filename,
+			    struct uvSegmentInfo **segments,
+			    size_t *n)
+{
+	struct uvSegmentInfo *segment;
+	size_t i; /* First valid closed segment. */
+	size_t j; /* Last valid closed segment. */
+
+	/* If there are no segments at all, or only open segments, there's
+	 * nothing to do. */
+	if (*segments == NULL || (*segments)[0].is_open) {
+		return 0;
+	}
+
+	/* Find the index of the most recent closed segment. */
+	for (j = 0; j < *n; j++) {
+		segment = &(*segments)[j];
+		if (segment->is_open) {
+			break;
+		}
+	}
+	assert(j > 0);
+	j--;
+
+	segment = &(*segments)[j];
+	tracef("most recent closed segment is %s", segment->filename);
+
+	/* If the end index of the last closed segment is lower than the last
+	 * snapshot index, there might be no entry that we can keep. We return
+	 * an empty segment list, unless there is at least one open segment, in
+	 * which case we keep everything hoping that they contain all the entries
+	 * since the last closed segment (TODO: we should encode the starting
+	 * entry in the open segment).
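+	 *
+	 * For example (hypothetical state): with closed segments
+	 * 0000000000000001-0000000000000100 and
+	 * 0000000000000101-0000000000000200 and a snapshot whose last index
+	 * is 250, neither closed segment reaches entry 251, so both are
+	 * discarded unless an open segment is present.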
*/ + if (segment->end_index < last_index) { + if (!(*segments)[*n - 1].is_open) { + tracef( + "discarding all closed segments, since most recent " + "is behind " + "last snapshot"); + raft_free(*segments); + *segments = NULL; + *n = 0; + return 0; + } + tracef( + "most recent closed segment %s is behind last snapshot, " + "yet there are open segments", + segment->filename); + } + + /* Now scan the segments backwards, searching for the longest list of + * contiguous closed segments. */ + if (j >= 1) { + for (i = j; i > 0; i--) { + struct uvSegmentInfo *newer; + struct uvSegmentInfo *older; + newer = &(*segments)[i]; + older = &(*segments)[i - 1]; + if (older->end_index != newer->first_index - 1) { + tracef("discarding non contiguous segment %s", + older->filename); + break; + } + } + } else { + i = j; + } + + /* Make sure that the first index of the first valid closed segment is + * not greater than the snapshot's last index plus one (so there are no + * missing entries). */ + segment = &(*segments)[i]; + if (segment->first_index > last_index + 1) { + ErrMsgPrintf(uv->io->errmsg, + "closed segment %s is past last snapshot %s", + segment->filename, snapshot_filename); + return RAFT_CORRUPT; + } + + if (i != 0) { + size_t new_n = *n - i; + struct uvSegmentInfo *new_segments; + new_segments = raft_malloc(new_n * sizeof *new_segments); + if (new_segments == NULL) { + return RAFT_NOMEM; + } + memcpy(new_segments, &(*segments)[i], + new_n * sizeof *new_segments); + raft_free(*segments); + *segments = new_segments; + *n = new_n; + } + + return 0; +} + +/* Load the last snapshot (if any) and all entries contained in all segment + * files of the data directory. This function can be called recursively, `depth` + * is there to ensure we don't get stuck in a recursive loop. */ +static int uvLoadSnapshotAndEntries(struct uv *uv, + struct raft_snapshot **snapshot, + raft_index *start_index, + struct raft_entry *entries[], + size_t *n, + int depth) +{ + struct uvSnapshotInfo *snapshots; + struct uvSegmentInfo *segments; + size_t n_snapshots; + size_t n_segments; + int rv; + + *snapshot = NULL; + *start_index = 1; + *entries = NULL; + *n = 0; + + /* List available snapshots and segments. */ + rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, + uv->io->errmsg); + if (rv != 0) { + goto err; + } + + /* Load the most recent snapshot, if any. */ + if (snapshots != NULL) { + char snapshot_filename[UV__FILENAME_LEN]; + *snapshot = RaftHeapMalloc(sizeof **snapshot); + if (*snapshot == NULL) { + rv = RAFT_NOMEM; + goto err; + } + rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], *snapshot, + uv->io->errmsg); + if (rv != 0) { + RaftHeapFree(*snapshot); + *snapshot = NULL; + goto err; + } + uvSnapshotFilenameOf(&snapshots[n_snapshots - 1], + snapshot_filename); + tracef("most recent snapshot at %lld", (*snapshot)->index); + RaftHeapFree(snapshots); + snapshots = NULL; + + /* Update the start index. If there are closed segments on disk + * let's make sure that the first index of the first closed + * segment is not greater than the snapshot's last index plus + * one (so there are no missing entries), and update the start + * index accordingly. 
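+		 *
+		 * E.g. (hypothetical indices): with a snapshot whose last
+		 * index is 100, a first closed segment starting at index 95
+		 * is acceptable and start_index becomes 95, while a first
+		 * closed segment starting at index 102 would leave entry 101
+		 * missing and is reported as corruption.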
*/
+		rv = uvFilterSegments(uv, (*snapshot)->index, snapshot_filename,
+				      &segments, &n_segments);
+		if (rv != 0) {
+			goto err;
+		}
+		if (segments != NULL) {
+			if (segments[0].is_open) {
+				*start_index = (*snapshot)->index + 1;
+			} else {
+				*start_index = segments[0].first_index;
+			}
+		} else {
+			*start_index = (*snapshot)->index + 1;
+		}
+	}
+
+	/* Read data from segments, closing any open segments. */
+	if (segments != NULL) {
+		raft_index last_index;
+		rv = uvSegmentLoadAll(uv, *start_index, segments, n_segments,
+				      entries, n);
+		if (rv != 0) {
+			goto err;
+		}
+
+		/* Check if all entries that we loaded are actually behind the
+		 * last snapshot. This can happen if the last closed segment was
+		 * behind the last snapshot and there were open segments, but
+		 * the entries in the open segments turned out to be behind the
+		 * snapshot as well. */
+		last_index = *start_index + *n - 1;
+		if (*snapshot != NULL && last_index < (*snapshot)->index) {
+			ErrMsgPrintf(uv->io->errmsg,
+				     "last entry on disk has index %llu, which "
+				     "is behind "
+				     "last snapshot's index %llu",
+				     last_index, (*snapshot)->index);
+			rv = RAFT_CORRUPT;
+			goto err;
+		}
+
+		raft_free(segments);
+		segments = NULL;
+	}
+
+	return 0;
+
+err:
+	assert(rv != 0);
+	if (*snapshot != NULL) {
+		snapshotDestroy(*snapshot);
+		*snapshot = NULL;
+	}
+	if (snapshots != NULL) {
+		raft_free(snapshots);
+	}
+	if (segments != NULL) {
+		raft_free(segments);
+	}
+	if (*entries != NULL) {
+		entryBatchesDestroy(*entries, *n);
+		*entries = NULL;
+		*n = 0;
+	}
+	/* Try to recover exactly once when corruption is detected; the first
+	 * pass might have cleaned up corrupt data. Most of the arguments are
+	 * already reset after the `err` label, except for `start_index`. */
+	if (rv == RAFT_CORRUPT && uv->auto_recovery && depth == 0) {
+		*start_index = 1;
+		return uvLoadSnapshotAndEntries(uv, snapshot, start_index,
+						entries, n, depth + 1);
+	}
+	return rv;
+}
+
+/* Implementation of raft_io->load. */
+static int uvLoad(struct raft_io *io,
+		  raft_term *term,
+		  raft_id *voted_for,
+		  struct raft_snapshot **snapshot,
+		  raft_index *start_index,
+		  struct raft_entry **entries,
+		  size_t *n_entries)
+{
+	struct uv *uv;
+	int rv;
+	uv = io->impl;
+
+	*term = uv->metadata.term;
+	*voted_for = uv->metadata.voted_for;
+	*snapshot = NULL;
+
+	rv = uvLoadSnapshotAndEntries(uv, snapshot, start_index, entries,
+				      n_entries, 0);
+	if (rv != 0) {
+		return rv;
+	}
+	tracef("start index %lld, %zu entries", *start_index, *n_entries);
+	if (*snapshot == NULL) {
+		tracef("no snapshot");
+	}
+
+	/* Set the index of the next entry that will be appended. */
+	uv->append_next_index = *start_index + *n_entries;
+
+	return 0;
+}
+
+/* Implementation of raft_io->set_term. */
+static int uvSetTerm(struct raft_io *io, const raft_term term)
+{
+	struct uv *uv;
+	int rv;
+	uv = io->impl;
+	uv->metadata.version++;
+	uv->metadata.term = term;
+	uv->metadata.voted_for = 0;
+	rv = uvMetadataStore(uv, &uv->metadata);
+	if (rv != 0) {
+		return rv;
+	}
+	return 0;
+}
+
+/* Implementation of raft_io->set_vote. */
+static int uvSetVote(struct raft_io *io, const raft_id server_id)
+{
+	struct uv *uv;
+	int rv;
+	uv = io->impl;
+	uv->metadata.version++;
+	uv->metadata.voted_for = server_id;
+	rv = uvMetadataStore(uv, &uv->metadata);
+	if (rv != 0) {
+		return rv;
+	}
+	return 0;
+}
+
+/* Implementation of raft_io->bootstrap.
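+ *
+ * This persists term 1 and a first closed segment holding a single
+ * configuration entry. A caller-side sketch (hypothetical values, using
+ * the public raft configuration API):
+ *
+ *   struct raft_configuration conf;
+ *   raft_configuration_init(&conf);
+ *   raft_configuration_add(&conf, 1, "127.0.0.1:9001", RAFT_VOTER);
+ *   rv = io->bootstrap(io, &conf);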
*/ +static int uvBootstrap(struct raft_io *io, + const struct raft_configuration *configuration) +{ + struct uv *uv; + int rv; + uv = io->impl; + + /* We shouldn't have written anything else yet. */ + if (uv->metadata.term != 0) { + ErrMsgPrintf(io->errmsg, "metadata contains term %lld", + uv->metadata.term); + return RAFT_CANTBOOTSTRAP; + } + + /* Write the term */ + rv = uvSetTerm(io, 1); + if (rv != 0) { + return rv; + } + + /* Create the first closed segment file, containing just one entry. */ + rv = uvSegmentCreateFirstClosed(uv, configuration); + if (rv != 0) { + return rv; + } + + return 0; +} + +/* Implementation of raft_io->recover. */ +static int uvRecover(struct raft_io *io, const struct raft_configuration *conf) +{ + struct uv *uv = io->impl; + struct raft_snapshot *snapshot; + raft_index start_index; + raft_index next_index; + struct raft_entry *entries; + size_t n_entries; + int rv; + + /* Load the current state. This also closes any leftover open segment. + */ + rv = uvLoadSnapshotAndEntries(uv, &snapshot, &start_index, &entries, + &n_entries, 0); + if (rv != 0) { + return rv; + } + + /* We don't care about the actual data, just index of the last entry. */ + if (snapshot != NULL) { + snapshotDestroy(snapshot); + } + if (entries != NULL) { + entryBatchesDestroy(entries, n_entries); + } + + assert(start_index > 0); + next_index = start_index + n_entries; + + rv = uvSegmentCreateClosedWithConfiguration(uv, next_index, conf); + if (rv != 0) { + return rv; + } + + return 0; +} + +/* Implementation of raft_io->time. */ +static raft_time uvTime(struct raft_io *io) +{ + struct uv *uv; + uv = io->impl; + return uv_now(uv->loop); +} + +/* Implementation of raft_io->random. */ +static int uvRandom(struct raft_io *io, int min, int max) +{ + (void)io; + return min + (abs(rand()) % (max - min)); +} + +static void uvSeedRand(struct uv *uv) +{ + ssize_t sz = -1; + unsigned seed = 0; /* fed to srand() */ + + sz = getrandom(&seed, sizeof seed, GRND_NONBLOCK); + if (sz == -1 || sz < ((ssize_t)sizeof seed)) { + /* Fall back to an inferior random seed when `getrandom` would + * have blocked or when not enough randomness was returned. */ + seed ^= (unsigned)uv->id; + seed ^= (unsigned)uv_now(uv->loop); + struct timeval time = {0}; + /* Ignore errors. */ + gettimeofday(&time, NULL); + seed ^= + (unsigned)((time.tv_sec * 1000) + (time.tv_usec / 1000)); + } + + srand(seed); +} + +int raft_uv_init(struct raft_io *io, + struct uv_loop_s *loop, + const char *dir, + struct raft_uv_transport *transport) +{ + struct uv *uv; + void *data; + int rv; + + assert(io != NULL); + assert(loop != NULL); + assert(dir != NULL); + assert(transport != NULL); + + data = io->data; + memset(io, 0, sizeof *io); + io->data = data; + + if (transport->version == 0) { + ErrMsgPrintf(io->errmsg, "transport->version must be set"); + return RAFT_INVALID; + } + + /* Ensure that the given path doesn't exceed our static buffer limit. 
*/ + if (!UV__DIR_HAS_VALID_LEN(dir)) { + ErrMsgPrintf(io->errmsg, "directory path too long"); + return RAFT_NAMETOOLONG; + } + + /* Allocate the raft_io_uv object */ + uv = raft_malloc(sizeof *uv); + if (uv == NULL) { + rv = RAFT_NOMEM; + goto err; + } + memset(uv, 0, sizeof(struct uv)); + + uv->io = io; + uv->loop = loop; + strncpy(uv->dir, dir, sizeof(uv->dir) - 1); + uv->dir[sizeof(uv->dir) - 1] = '\0'; + uv->transport = transport; + uv->transport->data = NULL; + uv->tracer = NULL; + uv->id = 0; /* Set by raft_io->config() */ + uv->state = UV__PRISTINE; + uv->errored = false; + uv->direct_io = false; + uv->async_io = false; + uv->fallocate = false; +#ifdef LZ4_ENABLED + uv->snapshot_compression = true; +#else + uv->snapshot_compression = false; +#endif + uv->segment_size = UV__MAX_SEGMENT_SIZE; + uv->block_size = 0; + QUEUE_INIT(&uv->clients); + QUEUE_INIT(&uv->servers); + uv->connect_retry_delay = CONNECT_RETRY_DELAY; + uv->prepare_inflight = NULL; + QUEUE_INIT(&uv->prepare_reqs); + QUEUE_INIT(&uv->prepare_pool); + uv->prepare_next_counter = 1; + uv->append_next_index = 1; + QUEUE_INIT(&uv->append_segments); + QUEUE_INIT(&uv->append_pending_reqs); + QUEUE_INIT(&uv->append_writing_reqs); + uv->barrier = NULL; + QUEUE_INIT(&uv->finalize_reqs); + uv->finalize_work.data = NULL; + uv->truncate_work.data = NULL; + QUEUE_INIT(&uv->snapshot_get_reqs); + QUEUE_INIT(&uv->async_work_reqs); + uv->snapshot_put_work.data = NULL; + uv->timer.data = NULL; + uv->tick_cb = NULL; /* Set by raft_io->start() */ + uv->recv_cb = NULL; /* Set by raft_io->start() */ + QUEUE_INIT(&uv->aborting); + uv->closing = false; + uv->close_cb = NULL; + uv->auto_recovery = true; + + uvSeedRand(uv); + + /* Set the raft_io implementation. */ + io->version = 2; /* future-proof'ing */ + io->impl = uv; + io->init = uvInit; + io->close = uvClose; + io->start = uvStart; + io->load = uvLoad; + io->bootstrap = uvBootstrap; + io->recover = uvRecover; + io->set_term = uvSetTerm; + io->set_vote = uvSetVote; + io->append = UvAppend; + io->truncate = UvTruncate; + io->send = UvSend; + io->snapshot_put = UvSnapshotPut; + io->snapshot_get = UvSnapshotGet; + io->async_work = UvAsyncWork; + io->time = uvTime; + io->random = uvRandom; + + return 0; + +err: + assert(rv != 0); + if (rv == RAFT_NOMEM) { + ErrMsgOom(io->errmsg); + } + return rv; +} + +void raft_uv_close(struct raft_io *io) +{ + struct uv *uv; + uv = io->impl; + io->impl = NULL; + raft_free(uv); +} + +void raft_uv_set_segment_size(struct raft_io *io, size_t size) +{ + struct uv *uv; + uv = io->impl; + uv->segment_size = size; +} + +void raft_uv_set_block_size(struct raft_io *io, size_t size) +{ + struct uv *uv; + uv = io->impl; + uv->block_size = size; +} + +int raft_uv_set_snapshot_compression(struct raft_io *io, bool compressed) +{ + struct uv *uv; + uv = io->impl; +#ifndef LZ4_AVAILABLE + if (compressed) { + return RAFT_INVALID; + } +#endif + uv->snapshot_compression = compressed; + return 0; +} + +void raft_uv_set_connect_retry_delay(struct raft_io *io, unsigned msecs) +{ + struct uv *uv; + uv = io->impl; + uv->connect_retry_delay = msecs; +} + +void raft_uv_set_tracer(struct raft_io *io, struct raft_tracer *tracer) +{ + struct uv *uv; + uv = io->impl; + uv->tracer = tracer; +} + +void raft_uv_set_auto_recovery(struct raft_io *io, bool flag) +{ + struct uv *uv; + uv = io->impl; + uv->auto_recovery = flag; +} + +#undef tracef diff --git a/src/raft/uv.h b/src/raft/uv.h new file mode 100644 index 000000000..db4009c64 --- /dev/null +++ b/src/raft/uv.h @@ -0,0 +1,422 @@ +/* 
Implementation of the @raft_io interface based on libuv. */ + +#ifndef UV_H_ +#define UV_H_ + +#include "../raft.h" +#include "../tracing.h" +#include "err.h" +#include "queue.h" +#include "uv_fs.h" +#include "uv_os.h" + +/* 8 Megabytes */ +#define UV__MAX_SEGMENT_SIZE (8 * 1024 * 1024) + +/* Template string for closed segment filenames: start index (inclusive), end + * index (inclusive). */ +#define UV__CLOSED_TEMPLATE "%016llu-%016llu" + +/* Template string for open segment filenames: incrementing counter. */ +#define UV__OPEN_TEMPLATE "open-%llu" + +/* Enough to hold a segment filename (either open or closed) */ +#define UV__SEGMENT_FILENAME_BUF_SIZE 34 + +/* Template string for snapshot filenames: snapshot term, snapshot index, + * creation timestamp (milliseconds since epoch). */ +#define UV__SNAPSHOT_TEMPLATE "snapshot-%llu-%llu-%llu" + +#define UV__SNAPSHOT_META_SUFFIX ".meta" + +/* Template string for snapshot metadata filenames: snapshot term, snapshot + * index, creation timestamp (milliseconds since epoch). */ +#define UV__SNAPSHOT_META_TEMPLATE \ + UV__SNAPSHOT_TEMPLATE UV__SNAPSHOT_META_SUFFIX + +/* State codes. */ +enum { + UV__PRISTINE, /* Metadata cache populated and I/O capabilities probed */ + UV__ACTIVE, + UV__CLOSED +}; + +/* Open segment counter type */ +typedef unsigned long long uvCounter; + +/* Information persisted in a single metadata file. */ +struct uvMetadata +{ + unsigned long long version; /* Monotonically increasing version */ + raft_term term; /* Current term */ + raft_id voted_for; /* Server ID of last vote, or 0 */ +}; + +/* Hold state of a libuv-based raft_io implementation. */ +struct uv +{ + struct raft_io *io; /* I/O object we're implementing */ + struct uv_loop_s *loop; /* UV event loop */ + char dir[UV__DIR_LEN]; /* Data directory */ + struct raft_uv_transport *transport; /* Network transport */ + struct raft_tracer *tracer; /* Debug tracing */ + raft_id id; /* Server ID */ + int state; /* Current state */ + bool snapshot_compression; /* If compression is enabled */ + bool errored; /* If a disk I/O error was hit */ + bool direct_io; /* Whether direct I/O is supported */ + bool async_io; /* Whether async I/O is supported */ + bool fallocate; /* Whether fallocate is supported */ + size_t segment_size; /* Initial size of open segments. */ + size_t block_size; /* Block size of the data dir */ + queue clients; /* Outbound connections */ + queue servers; /* Inbound connections */ + unsigned connect_retry_delay; /* Client connection retry delay */ + void *prepare_inflight; /* Segment being prepared */ + queue prepare_reqs; /* Pending prepare requests. */ + queue prepare_pool; /* Prepared open segments */ + uvCounter prepare_next_counter; /* Counter of next open segment */ + raft_index append_next_index; /* Index of next entry to append */ + queue append_segments; /* Open segments in use. */ + queue append_pending_reqs; /* Pending append requests. 
*/
+	queue append_writing_reqs;          /* Append requests in flight */
+	struct UvBarrier *barrier;          /* Inflight barrier request */
+	queue finalize_reqs;                /* Segments waiting to be closed */
+	struct uv_work_s finalize_work;     /* Resize and rename segments */
+	struct uv_work_s truncate_work;     /* Execute truncate log requests */
+	queue snapshot_get_reqs;            /* Inflight get snapshot requests */
+	queue async_work_reqs;              /* Inflight async work requests */
+	struct uv_work_s snapshot_put_work; /* Execute snapshot put requests */
+	struct uvMetadata metadata;         /* Cache of metadata on disk */
+	struct uv_timer_s timer;            /* Timer for periodic ticks */
+	raft_io_tick_cb tick_cb;            /* Invoked when the timer expires */
+	raft_io_recv_cb recv_cb;            /* Invoked when RPC messages arrive */
+	queue aborting;                     /* Cleanups upon errors or shutdown */
+	bool closing;                       /* True if we are closing */
+	raft_io_close_cb close_cb;          /* Invoked when finishing closing */
+	bool auto_recovery; /* Try to recover from corrupt segments */
+};
+
+/* Implementation of raft_io->truncate. */
+int UvTruncate(struct raft_io *io, raft_index index);
+
+/* Load Raft metadata from disk, choosing the most recent version (either the
+ * metadata1 or metadata2 file). */
+int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg);
+
+/* Store the given metadata to disk, writing the appropriate metadata file
+ * according to the metadata version (if the version is odd, write metadata1,
+ * otherwise write metadata2). */
+int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata);
+
+/* Metadata about a segment file. */
+struct uvSegmentInfo
+{
+	bool is_open; /* Whether the segment is open */
+	union {
+		struct
+		{
+			raft_index
+			    first_index; /* First index in a closed segment */
+			raft_index
+			    end_index; /* Last index in a closed segment */
+		};
+		struct
+		{
+			unsigned long long counter; /* Open segment counter */
+		};
+	};
+	char filename[UV__SEGMENT_FILENAME_BUF_SIZE]; /* Segment filename */
+};
+
+/* Append a new item to the given segment info list if the given filename
+ * matches either the one of a closed segment (xxx-yyy) or the one of an open
+ * segment (open-xxx). */
+int uvSegmentInfoAppendIfMatch(const char *filename,
+			       struct uvSegmentInfo *infos[],
+			       size_t *n_infos,
+			       bool *appended);
+
+/* Sort the given list of segments by comparing their filenames. Closed segments
+ * come before open segments. */
+void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos);
+
+/* Keep only the closed segments whose entries are within the given trailing
+ * amount past the given snapshot last index. If the given trailing amount is 0,
+ * unconditionally delete all closed segments. */
+int uvSegmentKeepTrailing(struct uv *uv,
+			  struct uvSegmentInfo *segments,
+			  size_t n,
+			  raft_index last_index,
+			  size_t trailing,
+			  char *errmsg);
+
+/* Load all entries contained in the given closed segment. */
+int uvSegmentLoadClosed(struct uv *uv,
+			struct uvSegmentInfo *segment,
+			struct raft_entry *entries[],
+			size_t *n);
+
+/* Load raft entries from the given segments. The @start_index is the expected
+ * index of the first entry of the first segment. */
+int uvSegmentLoadAll(struct uv *uv,
+		     const raft_index start_index,
+		     struct uvSegmentInfo *segments,
+		     size_t n_segments,
+		     struct raft_entry **entries,
+		     size_t *n_entries);
+
+/* Return the number of blocks in a segment. */
+#define uvSegmentBlocks(UV) (UV->segment_size / UV->block_size)
+
+/* A dynamically allocated buffer holding data to be written into a segment
+ * file.
+ * + * The memory is aligned at disk block boundary, to allow for direct I/O. */ +struct uvSegmentBuffer +{ + size_t block_size; /* Disk block size for direct I/O */ + uv_buf_t arena; /* Previously allocated memory that can be re-used */ + size_t n; /* Write offset */ +}; + +/* Initialize an empty buffer. */ +void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size); + +/* Release all memory used by the buffer. */ +void uvSegmentBufferClose(struct uvSegmentBuffer *b); + +/* Encode the format version at the very beginning of the buffer. This function + * must be called when the buffer is empty. */ +int uvSegmentBufferFormat(struct uvSegmentBuffer *b); + +/* Extend the segment's buffer by encoding the given entries. + * + * Previous data in the buffer will be retained, and data for these new entries + * will be appended. */ +int uvSegmentBufferAppend(struct uvSegmentBuffer *b, + const struct raft_entry entries[], + unsigned n_entries); + +/* After all entries to write have been encoded, finalize the buffer by zeroing + * the unused memory of the last block. The out parameter will point to the + * memory to write. */ +void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out); + +/* Reset the buffer preparing it for the next segment write. + * + * If the retain parameter is greater than zero, then the data of the retain'th + * block will be copied at the beginning of the buffer and the write offset will + * be set accordingly. */ +void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain); + +/* Write a closed segment, containing just one entry at the given index + * for the given configuration. */ +int uvSegmentCreateClosedWithConfiguration( + struct uv *uv, + raft_index index, + const struct raft_configuration *configuration); + +/* Write the first closed segment, containing just one entry for the given + * configuration. */ +int uvSegmentCreateFirstClosed(struct uv *uv, + const struct raft_configuration *configuration); + +/* Truncate a segment that was already closed. */ +int uvSegmentTruncate(struct uv *uv, + struct uvSegmentInfo *segment, + raft_index index); + +/* Info about a persisted snapshot stored in snapshot metadata file. */ +struct uvSnapshotInfo +{ + raft_term term; + raft_index index; + unsigned long long timestamp; + char filename[UV__FILENAME_LEN]; +}; + +/* Render the filename of the data file of a snapshot */ +void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename); + +/* Upon success `orphan` will be true if filename is a snapshot file without a + * sibling .meta file */ +int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan); + +/* Upon success `orphan` will be true if filename is a snapshot .meta file + * without a sibling snapshot file */ +int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan); + +/* Append a new item to the given snapshot info list if the given filename + * matches the pattern of a snapshot metadata file (snapshot-xxx-yyy-zzz.meta) + * and there is actually a matching non-empty snapshot file on disk. */ +int UvSnapshotInfoAppendIfMatch(struct uv *uv, + const char *filename, + struct uvSnapshotInfo *infos[], + size_t *n_infos, + bool *appended); + +/* Sort the given list of snapshots by comparing their filenames. Older + * snapshots will come first. */ +void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos); + +/* Load the snapshot associated with the given metadata. 
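+ *
+ * For instance (hypothetical filenames), metadata parsed from
+ * snapshot-3-15360-1690000000000.meta is used to load the matching data
+ * file snapshot-3-15360-1690000000000, following the
+ * UV__SNAPSHOT_META_TEMPLATE pattern above.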
*/
+int UvSnapshotLoad(struct uv *uv,
+		   struct uvSnapshotInfo *meta,
+		   struct raft_snapshot *snapshot,
+		   char *errmsg);
+
+/* Implementation of raft_io->snapshot_put (defined in uv_snapshot.c). */
+int UvSnapshotPut(struct raft_io *io,
+		  unsigned trailing,
+		  struct raft_io_snapshot_put *req,
+		  const struct raft_snapshot *snapshot,
+		  raft_io_snapshot_put_cb cb);
+
+/* Implementation of raft_io->snapshot_get (defined in uv_snapshot.c). */
+int UvSnapshotGet(struct raft_io *io,
+		  struct raft_io_snapshot_get *req,
+		  raft_io_snapshot_get_cb cb);
+
+/* Implementation of raft_io->async_work (defined in uv_work.c). */
+int UvAsyncWork(struct raft_io *io,
+		struct raft_io_async_work *req,
+		raft_io_async_work_cb cb);
+
+/* Return a list of all snapshots and segments found in the data directory. Both
+ * snapshots and segments are ordered by filename (closed segments come before
+ * open ones). */
+int UvList(struct uv *uv,
+	   struct uvSnapshotInfo *snapshots[],
+	   size_t *n_snapshots,
+	   struct uvSegmentInfo *segments[],
+	   size_t *n_segments,
+	   char *errmsg);
+
+/* Request to obtain a newly prepared open segment. */
+struct uvPrepare;
+typedef void (*uvPrepareCb)(struct uvPrepare *req, int status);
+struct uvPrepare
+{
+	void *data;                 /* User data */
+	uv_file fd;                 /* Resulting segment file descriptor */
+	unsigned long long counter; /* Resulting segment counter */
+	uvPrepareCb cb;             /* Completion callback */
+	queue queue;                /* Links in uv_io->prepare_reqs */
+};
+
+/* Get a prepared open segment ready for writing. If a prepared open segment is
+ * already available in the pool, it will be returned immediately using the fd
+ * and counter pointers and the request callback won't be invoked. Otherwise the
+ * request will be queued and its callback invoked once a newly prepared segment
+ * is available. */
+int UvPrepare(struct uv *uv,
+	      uv_file *fd,
+	      uvCounter *counter,
+	      struct uvPrepare *req,
+	      uvPrepareCb cb);
+
+/* Cancel all pending prepare requests and start removing all unused prepared
+ * open segments. If a segment is currently being created, wait for it to
+ * complete and then remove it immediately. */
+void UvPrepareClose(struct uv *uv);
+
+/* Implementation of raft_io->append. All the raft_buffers of the raft_entry
+ * structs in the entries array are required to have a len that is a multiple
+ * of 8. */
+int UvAppend(struct raft_io *io,
+	     struct raft_io_append *req,
+	     const struct raft_entry entries[],
+	     unsigned n,
+	     raft_io_append_cb cb);
+
+/* Barrier request object and callback. */
+struct UvBarrierReq;
+
+/* A barrier cb that plans to perform work on the threadpool MUST exit early
+ * and clean up resources when it detects uv->closing; this is to allow forced
+ * closing on shutdown. */
+typedef void (*UvBarrierCb)(struct UvBarrierReq *req);
+struct UvBarrierReq
+{
+	bool blocking;  /* Whether this barrier should block future writes */
+	void *data;     /* User data */
+	UvBarrierCb cb; /* Completion callback */
+	queue queue;    /* Queue of reqs triggered by a UvBarrier */
+};
+
+struct UvBarrier
+{
+	bool blocking; /* Whether this barrier should block future writes */
+	queue reqs;    /* Queue of UvBarrierReq */
+};
+
+/* Submit a barrier request to interrupt the normal flow of append
+ * operations.
+ *
+ * The following will happen:
+ *
+ * - Replace uv->append_next_index with the given next_index, so the next entry
+ *   that will be appended will have the new index.
+ *
+ * - Execution of new writes for subsequent append requests will be blocked
+ *   until UvUnblock is called when the barrier is blocking.
+ *
+ * - Wait for all currently pending and inflight append requests against all
+ *   open segments to complete, and for those open segments to be finalized,
+ *   then invoke the barrier callback.
+ *
+ * This API is used to implement truncate and snapshot install operations, which
+ * need to wait until all pending writes have settled and modify the log state,
+ * changing the next index. */
+int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req);
+
+/* Trigger a callback for a barrier request in this @barrier. Returns true if a
+ * callback was triggered, false if there are no more requests to trigger.
+ * A barrier callback will call UvUnblock, which in turn will try to run the
+ * next callback, if any, from a barrier request in this barrier. */
+bool UvBarrierMaybeTrigger(struct UvBarrier *barrier);
+
+/* Add a barrier @req to an existing @barrier. */
+void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req);
+
+/* Returns @true if there are no more segments referencing uv->barrier */
+bool UvBarrierReady(struct uv *uv);
+
+/* Resume writing append requests after UvBarrier has been called. */
+void UvUnblock(struct uv *uv);
+
+/* Cancel all pending write requests and request the current segment to be
+ * finalized. Must be invoked at closing time. */
+void uvAppendClose(struct uv *uv);
+
+/* Submit a request to finalize the open segment with the given counter.
+ *
+ * Requests are processed one at a time, to avoid ending up closing open segment
+ * N + 1 before closing open segment N. */
+int UvFinalize(struct uv *uv,
+	       unsigned long long counter,
+	       size_t used,
+	       raft_index first_index,
+	       raft_index last_index);
+
+/* Implementation of raft_io->send. */
+int UvSend(struct raft_io *io,
+	   struct raft_io_send *req,
+	   const struct raft_message *message,
+	   raft_io_send_cb cb);
+
+/* Stop all clients by closing the outbound stream handles and canceling all
+ * pending send requests. */
+void UvSendClose(struct uv *uv);
+
+/* Start receiving messages from new incoming connections. */
+int UvRecvStart(struct uv *uv);
+
+/* Stop all servers by closing the inbound stream handles and aborting all
+ * requests being received. */
+void UvRecvClose(struct uv *uv);
+
+void uvMaybeFireCloseCb(struct uv *uv);
+
+#endif /* UV_H_ */
diff --git a/src/raft/uv_append.c b/src/raft/uv_append.c
new file mode 100644
index 000000000..9feb66ca5
--- /dev/null
+++ b/src/raft/uv_append.c
@@ -0,0 +1,1034 @@
+#include "assert.h"
+#include "byte.h"
+#include "heap.h"
+#include "queue.h"
+#include "uv.h"
+#include "uv_encoding.h"
+#include "uv_writer.h"
+
+/* The happy path for an append request is:
+ *
+ * - If there is a current segment and it has enough spare capacity to hold
+ *   the entries in the request, then queue the request, linking it to the
+ *   current segment.
+ *
+ * - If there is no current segment, or it hasn't enough spare capacity to hold
+ *   the entries in the request, then request a new open segment to be prepared,
+ *   queue the request and link it to the newly requested segment.
+ *
+ * - Wait for any pending write against the current segment to complete, and
+ *   also for the prepare request if we asked for a new segment. Also wait for
+ *   any in progress barrier to be removed.
+ *
+ * - Submit a write request for the entries in this append request.
The write + * request might contain other append requests targeted to the current segment + * that might have accumulated in the meantime, if we have been waiting for a + * segment to be prepared, or for the previous write to complete or for a + * barrier to be removed. + * + * - Wait for the write request to finish and fire the append request's + * callback. + * + * Possible failure modes are: + * + * - The request to prepare a new segment fails. + * - The write request fails. + * - The request to finalize a new segment fails to be submitted. + * + * In all these cases we mark the instance as errored and fire the relevant + * callbacks. + **/ + +/* An open segment being written or waiting to be written. */ +struct uvAliveSegment +{ + struct uv *uv; /* Our writer */ + struct uvPrepare prepare; /* Prepare segment file request */ + struct UvWriter writer; /* Writer to perform async I/O */ + struct UvWriterReq write; /* Write request */ + unsigned long long counter; /* Open segment counter */ + raft_index first_index; /* Index of the first entry written */ + raft_index pending_last_index; /* Index of the last entry written */ + size_t size; /* Total number of bytes used */ + unsigned next_block; /* Next segment block to write */ + struct uvSegmentBuffer pending; /* Buffer for data yet to be written */ + uv_buf_t buf; /* Write buffer for current write */ + raft_index last_index; /* Last entry actually written */ + size_t written; /* Number of bytes actually written */ + queue queue; /* Segment queue */ + struct UvBarrier *barrier; /* Barrier waiting on this segment */ + bool finalize; /* Finalize the segment after writing */ +}; + +struct uvAppend +{ + struct raft_io_append *req; /* User request */ + const struct raft_entry *entries; /* Entries to write */ + unsigned n; /* Number of entries */ + struct uvAliveSegment *segment; /* Segment to write to */ + queue queue; +}; + +static void uvAliveSegmentWriterCloseCb(struct UvWriter *writer) +{ + struct uvAliveSegment *segment = writer->data; + struct uv *uv = segment->uv; + uvSegmentBufferClose(&segment->pending); + RaftHeapFree(segment); + uvMaybeFireCloseCb(uv); +} + +/* Submit a request to close the current open segment. */ +static void uvAliveSegmentFinalize(struct uvAliveSegment *s) +{ + struct uv *uv = s->uv; + int rv; + + rv = UvFinalize(uv, s->counter, s->written, s->first_index, + s->last_index); + if (rv != 0) { + uv->errored = true; + /* We failed to submit the finalize request, but let's still + * close the file handle and release the segment memory. */ + } + + QUEUE_REMOVE(&s->queue); + UvWriterClose(&s->writer, uvAliveSegmentWriterCloseCb); +} + +/* Flush the append requests in the given queue, firing their callbacks with the + * given status. */ +static void uvAppendFinishRequestsInQueue(struct uv *uv, queue *q, int status) +{ + queue queue_copy; + struct uvAppend *append; + QUEUE_INIT(&queue_copy); + while (!QUEUE_IS_EMPTY(q)) { + queue *head; + head = QUEUE_HEAD(q); + append = QUEUE_DATA(head, struct uvAppend, queue); + /* Rollback the append next index if the result was + * unsuccessful. 
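+		 *
+		 * For example (hypothetical indices): if append_next_index
+		 * was 104 after queuing a 3-entry batch starting at index
+		 * 101, a failed write rewinds it to 101, keeping the index
+		 * accounting consistent with what actually reached disk.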
*/ + if (status != 0) { + tracef("rollback uv->append_next_index was:%llu", + uv->append_next_index); + uv->append_next_index -= append->n; + tracef("rollback uv->append_next_index now:%llu", + uv->append_next_index); + } + QUEUE_REMOVE(head); + QUEUE_PUSH(&queue_copy, head); + } + while (!QUEUE_IS_EMPTY(&queue_copy)) { + queue *head; + struct raft_io_append *req; + head = QUEUE_HEAD(&queue_copy); + append = QUEUE_DATA(head, struct uvAppend, queue); + QUEUE_REMOVE(head); + req = append->req; + RaftHeapFree(append); + req->cb(req, status); + } +} + +/* Flush the append requests in the writing queue, firing their callbacks with + * the given status. */ +static void uvAppendFinishWritingRequests(struct uv *uv, int status) +{ + uvAppendFinishRequestsInQueue(uv, &uv->append_writing_reqs, status); +} + +/* Flush the append requests in the pending queue, firing their callbacks with + * the given status. */ +static void uvAppendFinishPendingRequests(struct uv *uv, int status) +{ + uvAppendFinishRequestsInQueue(uv, &uv->append_pending_reqs, status); +} + +/* Return the segment currently being written, or NULL when no segment has been + * written yet. */ +static struct uvAliveSegment *uvGetCurrentAliveSegment(struct uv *uv) +{ + queue *head; + if (QUEUE_IS_EMPTY(&uv->append_segments)) { + return NULL; + } + head = QUEUE_HEAD(&uv->append_segments); + return QUEUE_DATA(head, struct uvAliveSegment, queue); +} + +/* Extend the segment's write buffer by encoding the entries in the given + * request into it. IOW, previous data in the write buffer will be retained, and + * data for these new entries will be appended. */ +static int uvAliveSegmentEncodeEntriesToWriteBuf(struct uvAliveSegment *segment, + struct uvAppend *append) +{ + int rv; + assert(append->segment == segment); + + /* If this is the very first write to the segment, we need to include + * the format version */ + if (segment->pending.n == 0 && segment->next_block == 0) { + rv = uvSegmentBufferFormat(&segment->pending); + if (rv != 0) { + return rv; + } + } + + rv = uvSegmentBufferAppend(&segment->pending, append->entries, + append->n); + if (rv != 0) { + return rv; + } + + segment->pending_last_index += append->n; + + return 0; +} + +static int uvAppendMaybeStart(struct uv *uv); +static void uvAliveSegmentWriteCb(struct UvWriterReq *write, const int status) +{ + struct uvAliveSegment *s = write->data; + struct uv *uv = s->uv; + unsigned n_blocks; + int rv; + + assert(uv->state != UV__CLOSED); + + assert(s->buf.len % uv->block_size == 0); + assert(s->buf.len >= uv->block_size); + + /* Check if the write was successful. */ + if (status != 0) { + tracef("write: %s", uv->io->errmsg); + uv->errored = true; + goto out; + } + + s->written = s->next_block * uv->block_size + s->pending.n; + s->last_index = s->pending_last_index; + + /* Update our write markers. + * + * We have four cases: + * + * - The data fit completely in the leftover space of the first block + * that we wrote and there is more space left. In this case we just keep + * the scheduled marker unchanged. + * + * - The data fit completely in the leftover space of the first block + * that we wrote and there is no space left. In this case we advance the + * current block counter, reset the first write block and set the + * scheduled marker to 0. + * + * - The data did not fit completely in the leftover space of the first + * block that we wrote, so we wrote more than one block. The last + * block that we wrote was not filled completely and has leftover space. 
+	 *   In this case we advance the current block counter and copy the memory
+	 *   used for the last block to the head of the write arena list, updating
+	 *   the scheduled marker accordingly.
+	 *
+	 * - The data did not fit completely in the leftover space of the first
+	 *   block that we wrote, so we wrote more than one block. The last
+	 *   block that we wrote was filled exactly and has no leftover space. In
+	 *   this case we advance the current block counter, reset the first
+	 *   buffer and set the scheduled marker to 0.
+	 */
+	n_blocks = (unsigned)(s->buf.len /
+			      uv->block_size); /* Number of blocks written. */
+	if (s->pending.n < uv->block_size) {
+		/* Nothing to do */
+		assert(n_blocks == 1);
+	} else if (s->pending.n == uv->block_size) {
+		assert(n_blocks == 1);
+		s->next_block++;
+		uvSegmentBufferReset(&s->pending, 0);
+	} else {
+		assert(s->pending.n > uv->block_size);
+		assert(s->buf.len > uv->block_size);
+
+		if (s->pending.n % uv->block_size > 0) {
+			s->next_block += n_blocks - 1;
+			uvSegmentBufferReset(&s->pending, n_blocks - 1);
+		} else {
+			s->next_block += n_blocks;
+			uvSegmentBufferReset(&s->pending, 0);
+		}
+	}
+
+out:
+	/* Fire the callbacks of all requests that were fulfilled with this
+	 * write. */
+	uvAppendFinishWritingRequests(uv, status);
+	if (status != 0) {
+		/* When the write has failed additionally cancel all future
+		 * append related activity. This will also rewind
+		 * uv->append_next_index. All append requests need to be
+		 * canceled because raft assumes all appends happen in order and
+		 * if an append fails (and is not retried), we would be missing
+		 * a sequence of log entries on disk. The implementation can't
+		 * handle that + the accounting of the append index would be
+		 * off.
+		 */
+		uvAppendFinishPendingRequests(uv, status);
+		/* Allow this segment to be finalized further down. Don't bother
+		 * rewinding state to possibly reuse the segment for writing;
+		 * it's too bug-prone. */
+		s->pending_last_index = s->last_index;
+		s->finalize = true;
+	}
+
+	/* During the closing sequence we should have already canceled all
+	 * pending requests. */
+	if (uv->closing) {
+		assert(QUEUE_IS_EMPTY(&uv->append_pending_reqs));
+		assert(s->finalize);
+		uvAliveSegmentFinalize(s);
+		return;
+	}
+
+	/* Possibly process waiting requests. */
+	if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
+		rv = uvAppendMaybeStart(uv);
+		if (rv != 0) {
+			uv->errored = true;
+		}
+	} else if (s->finalize && (s->pending_last_index == s->last_index) &&
+		   !s->writer.closing) {
+		/* If there are no more append_pending_reqs or write requests in
+		 * flight, this segment must be finalized here in case we don't
+		 * receive AppendEntries RPCs anymore (could happen during a
+		 * Snapshot install, causing the BarrierCb to never fire), but
+		 * check that the callbacks that fired after completion of this
+		 * write didn't already close the segment. */
+		uvAliveSegmentFinalize(s);
+	}
+}
+
+/* Submit a file write request to append the entries encoded in the write buffer
+ * of the given segment.
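+ *
+ * The write is submitted at byte offset next_block * block_size, so e.g.
+ * (hypothetical numbers) with a 4096-byte block size and next_block == 2
+ * the buffer lands at offset 8192, keeping every submission block-aligned
+ * for direct I/O.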
*/
+static int uvAliveSegmentWrite(struct uvAliveSegment *s)
+{
+	int rv;
+	assert(s->counter != 0);
+	assert(s->pending.n > 0);
+	uvSegmentBufferFinalize(&s->pending, &s->buf);
+	rv = UvWriterSubmit(&s->writer, &s->write, &s->buf, 1,
+			    s->next_block * s->uv->block_size,
+			    uvAliveSegmentWriteCb);
+	if (rv != 0) {
+		return rv;
+	}
+	return 0;
+}
+
+/* Start writing all pending append requests for the current segment, unless we
+ * are already writing, or the segment itself has not yet been prepared or we
+ * are blocked on a barrier. If there are no more requests targeted at the
+ * current segment, make sure it's marked to be finalized and try with the next
+ * segment. */
+static int uvAppendMaybeStart(struct uv *uv)
+{
+	struct uvAliveSegment *segment;
+	struct uvAppend *append;
+	unsigned n_reqs;
+	queue *head;
+	queue q;
+	int rv;
+
+	assert(!uv->closing);
+	assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs));
+
+	/* If we are already writing, let's wait. */
+	if (!QUEUE_IS_EMPTY(&uv->append_writing_reqs)) {
+		return 0;
+	}
+
+start:
+	segment = uvGetCurrentAliveSegment(uv);
+	assert(segment != NULL);
+	/* If the preparer isn't done yet, let's wait. */
+	if (segment->counter == 0) {
+		return 0;
+	}
+
+	/* If there's a blocking barrier in progress, and it's not waiting for
+	 * this segment to be finalized, let's wait.
+	 *
+	 * FIXME shouldn't we wait even if segment->barrier == uv->barrier, if
+	 * there are other open segments associated with the same barrier? */
+	if (uv->barrier != NULL && segment->barrier != uv->barrier &&
+	    uv->barrier->blocking) {
+		return 0;
+	}
+
+	/* If there's no barrier in progress and this segment is marked with a
+	 * barrier, it means that this was a pending barrier, which can become
+	 * the current barrier now. */
+	if (uv->barrier == NULL && segment->barrier != NULL) {
+		uv->barrier = segment->barrier;
+	}
+
+	/* Let's add to the segment's write buffer all pending requests targeted
+	 * to this segment. */
+	QUEUE_INIT(&q);
+
+	n_reqs = 0;
+	while (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
+		head = QUEUE_HEAD(&uv->append_pending_reqs);
+		append = QUEUE_DATA(head, struct uvAppend, queue);
+		assert(append->segment != NULL);
+		if (append->segment != segment) {
+			break; /* Not targeted to this segment */
+		}
+		QUEUE_REMOVE(head);
+		QUEUE_PUSH(&q, head);
+		n_reqs++;
+		rv = uvAliveSegmentEncodeEntriesToWriteBuf(segment, append);
+		if (rv != 0) {
+			goto err;
+		}
+	}
+
+	/* If we have no more requests for this segment, let's check if it has
+	 * been marked for closing, and in that case finalize it and possibly
+	 * trigger a write against the next segment (unless there is a truncate
+	 * request, in that case we need to wait for it). Otherwise it must mean
+	 * we have exhausted the queue of pending append requests. */
+	if (n_reqs == 0) {
+		assert(QUEUE_IS_EMPTY(&uv->append_writing_reqs));
+		if (segment->finalize) {
+			uvAliveSegmentFinalize(segment);
+			if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
+				goto start;
+			}
+		}
+		assert(QUEUE_IS_EMPTY(&uv->append_pending_reqs));
+		return 0;
+	}
+
+	while (!QUEUE_IS_EMPTY(&q)) {
+		head = QUEUE_HEAD(&q);
+		QUEUE_REMOVE(head);
+		QUEUE_PUSH(&uv->append_writing_reqs, head);
+	}
+
+	rv = uvAliveSegmentWrite(segment);
+	if (rv != 0) {
+		goto err;
+	}
+
+	return 0;
+
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+/* Invoked when a newly added open segment becomes ready for writing, after the
+ * associated UvPrepare request completes (either synchronously or
+ * asynchronously).
*/
+static int uvAliveSegmentReady(struct uv *uv,
+			       uv_file fd,
+			       uvCounter counter,
+			       struct uvAliveSegment *segment)
+{
+	int rv;
+	rv = UvWriterInit(&segment->writer, uv->loop, fd, uv->direct_io,
+			  uv->async_io, 1, uv->io->errmsg);
+	if (rv != 0) {
+		ErrMsgWrapf(uv->io->errmsg, "setup writer for open-%llu",
+			    counter);
+		return rv;
+	}
+	segment->counter = counter;
+	return 0;
+}
+
+static void uvAliveSegmentPrepareCb(struct uvPrepare *req, int status)
+{
+	struct uvAliveSegment *segment = req->data;
+	struct uv *uv = segment->uv;
+	int rv;
+
+	assert(segment->counter == 0);
+	assert(segment->written == 0);
+
+	/* If we have been closed, let's discard the segment. */
+	if (uv->closing) {
+		QUEUE_REMOVE(&segment->queue);
+		assert(status ==
+		       RAFT_CANCELED); /* UvPrepare cancels pending reqs */
+		uvSegmentBufferClose(&segment->pending);
+		RaftHeapFree(segment);
+		return;
+	}
+
+	if (status != 0) {
+		tracef("prepare segment failed (%d)", status);
+		rv = status;
+		goto err;
+	}
+
+	assert(req->counter > 0);
+	assert(req->fd >= 0);
+
+	/* There must be pending appends that were waiting for this prepare
+	 * request. */
+	assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs));
+
+	rv = uvAliveSegmentReady(uv, req->fd, req->counter, segment);
+	if (rv != 0) {
+		tracef("prepare segment ready failed (%d)", rv);
+		goto err;
+	}
+
+	rv = uvAppendMaybeStart(uv);
+	if (rv != 0) {
+		tracef("prepare segment start failed (%d)", rv);
+		goto err;
+	}
+
+	return;
+
+err:
+	QUEUE_REMOVE(&segment->queue);
+	RaftHeapFree(segment);
+	uv->errored = true;
+	uvAppendFinishPendingRequests(uv, rv);
+}
+
+/* Initialize a new open segment object. */
+static void uvAliveSegmentInit(struct uvAliveSegment *s, struct uv *uv)
+{
+	s->uv = uv;
+	s->prepare.data = s;
+	s->writer.data = s;
+	s->write.data = s;
+	s->counter = 0;
+	s->first_index = uv->append_next_index;
+	s->pending_last_index = s->first_index - 1;
+	s->last_index = 0;
+	s->size = sizeof(uint64_t) /* Format version */;
+	s->next_block = 0;
+	uvSegmentBufferInit(&s->pending, uv->block_size);
+	s->written = 0;
+	s->barrier = NULL;
+	s->finalize = false;
+}
+
+/* Add a new active open segment, since the append request being submitted does
+ * not fit in the last segment we scheduled writes for, or no segment had been
+ * previously requested at all. */
+static int uvAppendPushAliveSegment(struct uv *uv)
+{
+	struct uvAliveSegment *segment;
+	uv_file fd;
+	uvCounter counter;
+	int rv;
+
+	segment = RaftHeapMalloc(sizeof *segment);
+	if (segment == NULL) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+	uvAliveSegmentInit(segment, uv);
+
+	QUEUE_PUSH(&uv->append_segments, &segment->queue);
+
+	rv = UvPrepare(uv, &fd, &counter, &segment->prepare,
+		       uvAliveSegmentPrepareCb);
+	if (rv != 0) {
+		goto err_after_alloc;
+	}
+
+	/* If we've been returned a ready prepared segment right away, start
+	 * writing to it immediately. */
+	if (fd != -1) {
+		rv = uvAliveSegmentReady(uv, fd, counter, segment);
+		if (rv != 0) {
+			goto err_after_prepare;
+		}
+	}
+	return 0;
+
+err_after_prepare:
+	UvOsClose(fd);
+	UvFinalize(uv, counter, 0, 0, 0);
+err_after_alloc:
+	QUEUE_REMOVE(&segment->queue);
+	RaftHeapFree(segment);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+/* Return the last segment that we have requested to prepare.
*/ +static struct uvAliveSegment *uvGetLastAliveSegment(struct uv *uv) +{ + queue *tail; + if (QUEUE_IS_EMPTY(&uv->append_segments)) { + return NULL; + } + tail = QUEUE_TAIL(&uv->append_segments); + return QUEUE_DATA(tail, struct uvAliveSegment, queue); +} + +/* Return #true if the remaining capacity of the given segment is equal or + * greater than @size. */ +static bool uvAliveSegmentHasEnoughSpareCapacity(struct uvAliveSegment *s, + size_t size) +{ + return s->size + size <= s->uv->segment_size; +} + +/* Add @size bytes to the number of bytes that the segment will hold. The actual + * write will happen when the previous write completes, if any. */ +static void uvAliveSegmentReserveSegmentCapacity(struct uvAliveSegment *s, + size_t size) +{ + s->size += size; +} + +/* Return the number of bytes needed to store the batch of entries of this + * append request on disk. */ +static size_t uvAppendSize(struct uvAppend *a) +{ + size_t size = sizeof(uint32_t) * 2; /* CRC checksums */ + unsigned i; + size += uvSizeofBatchHeader(a->n); /* Batch header */ + for (i = 0; i < a->n; i++) { /* Entries data */ + size += bytePad64(a->entries[i].buf.len); + } + return size; +} + +/* Enqueue an append entries request, assigning it to the appropriate active + * open segment. */ +static int uvAppendEnqueueRequest(struct uv *uv, struct uvAppend *append) +{ + struct uvAliveSegment *segment; + size_t size; + bool fits; + int rv; + + assert(append->entries != NULL); + assert(append->n > 0); + assert(uv->append_next_index > 0); + tracef("enqueue %u entries", append->n); + + size = uvAppendSize(append); + + /* If we have no segments yet, it means this is the very first append, + * and we need to add a new segment. Otherwise we check if the last + * segment has enough room for this batch of entries. */ + segment = uvGetLastAliveSegment(uv); + if (segment == NULL || segment->finalize) { + fits = false; + } else { + fits = uvAliveSegmentHasEnoughSpareCapacity(segment, size); + if (!fits) { + segment->finalize = + true; /* Finalize when all writes are done */ + } + } + + /* If there's no segment or if this batch does not fit in this segment, + * we need to add a new one. 
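+	 *
+	 * E.g. (hypothetical numbers): with the default 8 MiB segment size,
+	 * a segment already accounting for 8 MiB minus 100 bytes cannot take
+	 * a 200-byte batch; it gets marked for finalization and the batch is
+	 * assigned to a freshly prepared segment.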
*/ + if (!fits) { + rv = uvAppendPushAliveSegment(uv); + if (rv != 0) { + goto err; + } + } + + segment = uvGetLastAliveSegment(uv); /* Get the last added segment */ + uvAliveSegmentReserveSegmentCapacity(segment, size); + + append->segment = segment; + QUEUE_PUSH(&uv->append_pending_reqs, &append->queue); + uv->append_next_index += append->n; + tracef("set uv->append_next_index %llu", uv->append_next_index); + + return 0; + +err: + assert(rv != 0); + return rv; +} + +/* Check that all entry buffers are 8-byte aligned */ +static int uvCheckEntryBuffersAligned(struct uv *uv, + const struct raft_entry entries[], + unsigned n) +{ + unsigned i; + + for (i = 0; i < n; i++) { + if (entries[i].buf.len % 8) { + ErrMsgPrintf(uv->io->errmsg, + "entry buffers must be 8-byte aligned"); + tracef("%s", uv->io->errmsg); + return RAFT_INVALID; + } + } + + return 0; +} + +int UvAppend(struct raft_io *io, + struct raft_io_append *req, + const struct raft_entry entries[], + unsigned n, + raft_io_append_cb cb) +{ + struct uv *uv; + struct uvAppend *append; + int rv; + + uv = io->impl; + assert(!uv->closing); + + append = RaftHeapCalloc(1, sizeof *append); + if (append == NULL) { + rv = RAFT_NOMEM; + goto err; + } + append->req = req; + append->entries = entries; + append->n = n; + req->cb = cb; + + rv = uvCheckEntryBuffersAligned(uv, entries, n); + if (rv != 0) { + goto err_after_req_alloc; + } + + rv = uvAppendEnqueueRequest(uv, append); + if (rv != 0) { + goto err_after_req_alloc; + } + + assert(append->segment != NULL); + assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs)); + + /* Try to write immediately. */ + rv = uvAppendMaybeStart(uv); + if (rv != 0) { + return rv; + } + + return 0; + +err_after_req_alloc: + RaftHeapFree(append); +err: + assert(rv != 0); + return rv; +} + +/* Finalize the current segment as soon as all its pending or inflight append + * requests get completed. */ +static void uvFinalizeCurrentAliveSegmentOnceIdle(struct uv *uv) +{ + struct uvAliveSegment *s; + queue *head; + bool has_pending_reqs; + bool has_writing_reqs; + + s = uvGetCurrentAliveSegment(uv); + if (s == NULL) { + return; + } + + /* Check if there are pending append requests targeted to the current + * segment. */ + has_pending_reqs = false; + QUEUE_FOREACH(head, &uv->append_pending_reqs) + { + struct uvAppend *r = QUEUE_DATA(head, struct uvAppend, queue); + if (r->segment == s) { + has_pending_reqs = true; + break; + } + } + has_writing_reqs = !QUEUE_IS_EMPTY(&uv->append_writing_reqs); + + /* If there is no pending append request or inflight write against the + * current segment, we can submit a request for it to be closed + * immediately. Otherwise, we set the finalize flag. + * + * TODO: is it actually possible to have pending requests with no + * writing requests? Probably no. 
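UvAppend above rejects any entry whose buffer length is not a multiple of 8 (see uvCheckEntryBuffersAligned). A sketch of how a caller might satisfy that contract, with a hypothetical helper that zero-pads the payload tail:

```c
#include <stdlib.h>
#include <string.h>

/* Hypothetical helper: size an entry buffer so that UvAppend's 8-byte
 * length rule holds, zero-padding the tail. */
static int entry_buf_alloc(struct raft_buffer *buf, const void *data,
			   size_t len)
{
	size_t padded = (len + 7) & ~(size_t)7;
	buf->base = malloc(padded);
	if (buf->base == NULL) {
		return RAFT_NOMEM;
	}
	memcpy(buf->base, data, len);
	memset((char *)buf->base + len, 0, padded - len);
	buf->len = padded;
	return 0;
}
```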
*/ + if (!has_pending_reqs && !has_writing_reqs) { + uvAliveSegmentFinalize(s); + } else { + s->finalize = true; + } +} + +bool UvBarrierReady(struct uv *uv) +{ + if (uv->barrier == NULL) { + return true; + } + + queue *head; + QUEUE_FOREACH(head, &uv->append_segments) + { + struct uvAliveSegment *segment; + segment = QUEUE_DATA(head, struct uvAliveSegment, queue); + if (segment->barrier == uv->barrier) { + return false; + } + } + return true; +} + +bool UvBarrierMaybeTrigger(struct UvBarrier *barrier) +{ + if (!barrier) { + return false; + } + + if (!QUEUE_IS_EMPTY(&barrier->reqs)) { + queue *head; + struct UvBarrierReq *r; + head = QUEUE_HEAD(&barrier->reqs); + QUEUE_REMOVE(head); + r = QUEUE_DATA(head, struct UvBarrierReq, queue); + r->cb(r); + return true; + } + + return false; +} + +/* Used during cleanup. */ +static void uvBarrierTriggerAll(struct UvBarrier *barrier) +{ + while (UvBarrierMaybeTrigger(barrier)) { + ; + } +} + +static struct UvBarrier *uvBarrierCreate(void) +{ + struct UvBarrier *barrier; + barrier = RaftHeapCalloc(1, sizeof(*barrier)); + if (!barrier) { + return NULL; + } + barrier->blocking = false; + QUEUE_INIT(&barrier->reqs); + return barrier; +} + +int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req) +{ + /* The barrier to attach to. */ + struct UvBarrier *barrier = NULL; + struct uvAliveSegment *segment = NULL; + queue *head; + + assert(!uv->closing); + + /* The next entry will be appended at this index. */ + uv->append_next_index = next_index; + tracef("UvBarrier uv->append_next_index:%llu", uv->append_next_index); + + /* Arrange for all open segments not already involved in other barriers + * to be finalized as soon as their append requests get completed and + * mark them as involved in this specific barrier request. */ + QUEUE_FOREACH(head, &uv->append_segments) + { + segment = QUEUE_DATA(head, struct uvAliveSegment, queue); + if (segment->barrier != NULL) { + /* If a non-blocking barrier precedes this blocking + * request, we want to also block all future writes. */ + if (req->blocking) { + segment->barrier->blocking = true; + } + continue; + } + + if (!barrier) { + barrier = uvBarrierCreate(); + if (!barrier) { + return RAFT_NOMEM; + } + /* And add the request to the barrier. */ + UvBarrierAddReq(barrier, req); + } + segment->barrier = barrier; + + if (segment == uvGetCurrentAliveSegment(uv)) { + uvFinalizeCurrentAliveSegmentOnceIdle(uv); + continue; + } + segment->finalize = true; + } + + /* Unable to attach to a segment, because all segments are involved in a + * barrier, or there are no segments. */ + if (barrier == NULL) { + /* Attach req to last segment barrier. */ + if (segment != NULL) { + barrier = segment->barrier; + /* There is no segment, attach to uv->barrier. */ + } else if (uv->barrier != NULL) { + barrier = uv->barrier; + /* There is no uv->barrier, make new one. */ + } else { + barrier = uvBarrierCreate(); + if (!barrier) { + return RAFT_NOMEM; + } + } + UvBarrierAddReq(barrier, req); + } + + /* Let's not continue writing new entries if something down the line + * asked us to stop writing. */ + if (uv->barrier != NULL && req->blocking) { + uv->barrier->blocking = true; + } + + assert(barrier != NULL); + if (uv->barrier == NULL) { + uv->barrier = barrier; + /* If there's no pending append-related activity, we can fire + * the callback immediately. + * + * TODO: find a way to avoid invoking this synchronously. 
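From a caller's perspective, the contract of UvBarrier above is: submit a UvBarrierReq with a callback, do the work that requires appends to be quiescent, then call UvUnblock. A hypothetical caller follows; the `data` field used to stash the uv instance is an assumption for illustration, not necessarily the real request layout:

```c
/* Hypothetical caller of the barrier API. */
static void barrier_cb(struct UvBarrierReq *req)
{
	struct uv *uv = req->data; /* assumed stash, see note above */
	/* ... perform the work that needed appends to be quiescent ... */
	UvUnblock(uv); /* resume queued barrier requests and appends */
}

static int submit_blocking_barrier(struct uv *uv, raft_index next_index,
				   struct UvBarrierReq *req)
{
	req->blocking = true; /* also blocks all future writes, see above */
	req->cb = barrier_cb;
	req->data = uv;
	return UvBarrier(uv, next_index, req);
}
```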
*/
+		if (QUEUE_IS_EMPTY(&uv->append_segments) &&
+		    QUEUE_IS_EMPTY(&uv->finalize_reqs) &&
+		    uv->finalize_work.data == NULL) {
+			/* Not interested in return value. */
+			UvBarrierMaybeTrigger(barrier);
+		}
+	}
+
+	return 0;
+}
+
+void UvUnblock(struct uv *uv)
+{
+	/* Fire the oldest pending barrier request, if any. UvUnblock will be
+	 * called again when that request's callback has fired. */
+	if (UvBarrierMaybeTrigger(uv->barrier)) {
+		tracef("UvUnblock triggered barrier request callback.");
+		return;
+	}
+
+	/* All requests in the barrier have finished. */
+	tracef("UvUnblock queue empty");
+	RaftHeapFree(uv->barrier);
+	uv->barrier = NULL;
+	if (uv->closing) {
+		uvMaybeFireCloseCb(uv);
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
+		int rv;
+		rv = uvAppendMaybeStart(uv);
+		if (rv != 0) {
+			uv->errored = true;
+		}
+	}
+}
+
+void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req)
+{
+	assert(barrier != NULL);
+	assert(req != NULL);
+	/* Once there's a blocking req, this barrier becomes blocking. */
+	barrier->blocking |= req->blocking;
+	QUEUE_PUSH(&barrier->reqs, &req->queue);
+}
+
+/* Fire all pending barrier requests; each barrier callback will notice that
+ * we're closing and abort there. */
+static void uvBarrierClose(struct uv *uv)
+{
+	tracef("uv barrier close");
+	struct UvBarrier *barrier = NULL;
+	queue *head;
+	assert(uv->closing);
+	QUEUE_FOREACH(head, &uv->append_segments)
+	{
+		struct uvAliveSegment *segment;
+		segment = QUEUE_DATA(head, struct uvAliveSegment, queue);
+		if (segment->barrier != NULL && segment->barrier != barrier &&
+		    segment->barrier != uv->barrier) {
+			barrier = segment->barrier;
+			/* Fire all barrier cb's; this is safe because the
+			 * barrier cb exits early when uv->closing is true. */
+			uvBarrierTriggerAll(barrier);
+			RaftHeapFree(barrier);
+		}
+		/* The segment->barrier field is used:
+		 *
+		 * - by UvBarrierReady, to check whether it's time to invoke
+		 *   the barrier callback after successfully finalizing a
+		 *   segment
+		 * - by uvAppendMaybeStart, to see whether we should go ahead
+		 *   with writing to a segment even though a barrier is active,
+		 *   because the barrier is waiting on that same segment to be
+		 *   finalized (but see the FIXME in that function)
+		 * - to save a barrier for later, if UvBarrier was called when
+		 *   uv->barrier was already set
+		 *
+		 * If we're cancelling the barrier, we don't need to save it
+		 * for later; the callback will not be invoked a second time in
+		 * any case; and uvAppendMaybeStart won't be called while
+		 * closing. So it's fine to clear segment->barrier here. */
+		segment->barrier = NULL;
+	}
+
+	/* There might still be a current barrier set on uv->barrier, meaning
+	 * that the open segment it was associated with has started to be
+	 * finalized and is no longer in the append_segments queue. Let's
+	 * cancel all untriggered barrier request callbacks too. */
+	if (uv->barrier != NULL) {
+		uvBarrierTriggerAll(uv->barrier);
+		/* Clear uv->barrier if there's no active work on the thread
+		 * pool. When the work on the thread pool finishes, UvUnblock
+		 * will notice we're closing, clear and free uv->barrier and
+		 * call uvMaybeFireCloseCb. UvUnblock will not try to fire any
+		 * more barrier request callbacks, because they were triggered
+		 * in the line above.
*/ + if (uv->snapshot_put_work.data == NULL && + uv->truncate_work.data == NULL) { + RaftHeapFree(uv->barrier); + uv->barrier = NULL; + } + } +} + +void uvAppendClose(struct uv *uv) +{ + struct uvAliveSegment *segment; + assert(uv->closing); + + uvBarrierClose(uv); + UvPrepareClose(uv); + + uvAppendFinishPendingRequests(uv, RAFT_CANCELED); + + uvFinalizeCurrentAliveSegmentOnceIdle(uv); + + /* Also finalize the segments that we didn't write at all and are just + * sitting in the append_segments queue waiting for writes against the + * current segment to complete. */ + while (!QUEUE_IS_EMPTY(&uv->append_segments)) { + segment = uvGetLastAliveSegment(uv); + assert(segment != NULL); + if (segment == uvGetCurrentAliveSegment(uv)) { + break; /* We reached the head of the queue */ + } + assert(segment->written == 0); + uvAliveSegmentFinalize(segment); + } +} diff --git a/src/raft/uv_encoding.c b/src/raft/uv_encoding.c new file mode 100644 index 000000000..085192e83 --- /dev/null +++ b/src/raft/uv_encoding.c @@ -0,0 +1,581 @@ +#include "uv_encoding.h" + +#include +#include + +#include "../raft.h" +#include "assert.h" +#include "byte.h" +#include "configuration.h" + +/** + * Size of the request preamble. + */ +#define RAFT_IO_UV__PREAMBLE_SIZE \ + (sizeof(uint64_t) /* Message type. */ + \ + sizeof(uint64_t) /* Message size. */) + +static size_t sizeofRequestVoteV1(void) +{ + return sizeof(uint64_t) + /* Term. */ + sizeof(uint64_t) + /* Candidate ID. */ + sizeof(uint64_t) + /* Last log index. */ + sizeof(uint64_t) /* Last log term. */; +} + +static size_t sizeofRequestVote(void) +{ + return sizeofRequestVoteV1() + + sizeof(uint64_t) /* Leadership transfer. */; +} + +static size_t sizeofRequestVoteResultV1(void) +{ + return sizeof(uint64_t) + /* Term. */ + sizeof(uint64_t) /* Vote granted. */; +} + +static size_t sizeofRequestVoteResult(void) +{ + return sizeofRequestVoteResultV1() + /* Size of older version 1 message + */ + sizeof(uint64_t) /* Flags. */; +} + +static size_t sizeofAppendEntries(const struct raft_append_entries *p) +{ + return sizeof(uint64_t) + /* Leader's term. */ + sizeof(uint64_t) + /* Leader ID */ + sizeof(uint64_t) + /* Previous log entry index */ + sizeof(uint64_t) + /* Previous log entry term */ + sizeof(uint64_t) + /* Leader's commit index */ + sizeof(uint64_t) + /* Number of entries in the batch */ + 16 * p->n_entries /* One header per entry */; +} + +static size_t sizeofAppendEntriesResultV0(void) +{ + return sizeof(uint64_t) + /* Term. */ + sizeof(uint64_t) + /* Success. */ + sizeof(uint64_t) /* Last log index. */; +} + +static size_t sizeofAppendEntriesResult(void) +{ + return sizeofAppendEntriesResultV0() + + sizeof(uint64_t) /* 64 bit Flags. */; +} + +static size_t sizeofInstallSnapshot(const struct raft_install_snapshot *p) +{ + size_t conf_size = configurationEncodedSize(&p->conf); + return sizeof(uint64_t) + /* Leader's term. */ + sizeof(uint64_t) + /* Leader ID */ + sizeof(uint64_t) + /* Snapshot's last index */ + sizeof(uint64_t) + /* Term of last index */ + sizeof(uint64_t) + /* Configuration's index */ + sizeof(uint64_t) + /* Length of configuration */ + conf_size + /* Configuration data */ + sizeof(uint64_t); /* Length of snapshot data */ +} + +static size_t sizeofTimeoutNow(void) +{ + return sizeof(uint64_t) + /* Term. */ + sizeof(uint64_t) + /* Last log index. */ + sizeof(uint64_t) /* Last log term. 
*/; +} + +size_t uvSizeofBatchHeader(size_t n) +{ + return 8 + /* Number of entries in the batch, little endian */ + 16 * n /* One header per entry */; +} + +static void encodeRequestVote(const struct raft_request_vote *p, void *buf) +{ + void *cursor = buf; + uint64_t flags = 0; + + if (p->disrupt_leader) { + flags |= 1 << 0; + } + if (p->pre_vote) { + flags |= 1 << 1; + } + + bytePut64(&cursor, p->term); + bytePut64(&cursor, p->candidate_id); + bytePut64(&cursor, p->last_log_index); + bytePut64(&cursor, p->last_log_term); + bytePut64(&cursor, flags); +} + +static void encodeRequestVoteResult(const struct raft_request_vote_result *p, + void *buf) +{ + void *cursor = buf; + uint64_t flags = 0; + + if (p->pre_vote) { + flags |= (1 << 0); + } + + bytePut64(&cursor, p->term); + bytePut64(&cursor, p->vote_granted); + bytePut64(&cursor, flags); +} + +static void encodeAppendEntries(const struct raft_append_entries *p, void *buf) +{ + void *cursor; + + cursor = buf; + + bytePut64(&cursor, p->term); /* Leader's term. */ + bytePut64(&cursor, p->prev_log_index); /* Previous index. */ + bytePut64(&cursor, p->prev_log_term); /* Previous term. */ + bytePut64(&cursor, p->leader_commit); /* Commit index. */ + + uvEncodeBatchHeader(p->entries, p->n_entries, cursor); +} + +static void encodeAppendEntriesResult( + const struct raft_append_entries_result *p, + void *buf) +{ + void *cursor = buf; + + bytePut64(&cursor, p->term); + bytePut64(&cursor, p->rejected); + bytePut64(&cursor, p->last_log_index); + bytePut64(&cursor, p->features); +} + +static void encodeInstallSnapshot(const struct raft_install_snapshot *p, + void *buf) +{ + void *cursor; + size_t conf_size = configurationEncodedSize(&p->conf); + + cursor = buf; + + bytePut64(&cursor, p->term); /* Leader's term. */ + bytePut64(&cursor, p->last_index); /* Snapshot last index. */ + bytePut64(&cursor, p->last_term); /* Term of last index. */ + bytePut64(&cursor, p->conf_index); /* Configuration index. */ + bytePut64(&cursor, conf_size); /* Configuration length. */ + configurationEncodeToBuf(&p->conf, cursor); + cursor = (uint8_t *)cursor + conf_size; + bytePut64(&cursor, p->data.len); /* Snapshot data size. */ +} + +static void encodeTimeoutNow(const struct raft_timeout_now *p, void *buf) +{ + void *cursor = buf; + + bytePut64(&cursor, p->term); + bytePut64(&cursor, p->last_log_index); + bytePut64(&cursor, p->last_log_term); +} + +int uvEncodeMessage(const struct raft_message *message, + uv_buf_t **bufs, + unsigned *n_bufs) +{ + uv_buf_t header; + void *cursor; + + /* Figure out the length of the header for this request and allocate a + * buffer for it. */ + header.len = RAFT_IO_UV__PREAMBLE_SIZE; + switch (message->type) { + case RAFT_IO_REQUEST_VOTE: + header.len += sizeofRequestVote(); + break; + case RAFT_IO_REQUEST_VOTE_RESULT: + header.len += sizeofRequestVoteResult(); + break; + case RAFT_IO_APPEND_ENTRIES: + header.len += + sizeofAppendEntries(&message->append_entries); + break; + case RAFT_IO_APPEND_ENTRIES_RESULT: + header.len += sizeofAppendEntriesResult(); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + header.len += + sizeofInstallSnapshot(&message->install_snapshot); + break; + case RAFT_IO_TIMEOUT_NOW: + header.len += sizeofTimeoutNow(); + break; + default: + return RAFT_MALFORMED; + }; + + header.base = raft_malloc(header.len); + if (header.base == NULL) { + goto oom; + } + + cursor = header.base; + + /* Encode the request preamble, with message type and message size. 
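For concreteness, the resulting framing for a version-2 RequestVote message, worked out from the sizeof helpers above:

```c
/*
 * preamble: 2 * 8 = 16 bytes  (message type, then the number of bytes
 *                              that follow)
 * header:   5 * 8 = 40 bytes  (term, candidate id, last log index,
 *                              last log term, flags)
 * total:            56 bytes, every field encoded little-endian
 */
```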
*/ + bytePut64(&cursor, message->type); + bytePut64(&cursor, header.len - RAFT_IO_UV__PREAMBLE_SIZE); + + /* Encode the request header. */ + switch (message->type) { + case RAFT_IO_REQUEST_VOTE: + encodeRequestVote(&message->request_vote, cursor); + break; + case RAFT_IO_REQUEST_VOTE_RESULT: + encodeRequestVoteResult(&message->request_vote_result, + cursor); + break; + case RAFT_IO_APPEND_ENTRIES: + encodeAppendEntries(&message->append_entries, cursor); + break; + case RAFT_IO_APPEND_ENTRIES_RESULT: + encodeAppendEntriesResult( + &message->append_entries_result, cursor); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + encodeInstallSnapshot(&message->install_snapshot, + cursor); + break; + case RAFT_IO_TIMEOUT_NOW: + encodeTimeoutNow(&message->timeout_now, cursor); + break; + }; + + *n_bufs = 1; + + /* For AppendEntries request we also send the entries payload. */ + if (message->type == RAFT_IO_APPEND_ENTRIES) { + *n_bufs += message->append_entries.n_entries; + } + + /* For InstallSnapshot request we also send the snapshot payload. */ + if (message->type == RAFT_IO_INSTALL_SNAPSHOT) { + *n_bufs += 1; + } + + *bufs = raft_calloc(*n_bufs, sizeof **bufs); + if (*bufs == NULL) { + goto oom_after_header_alloc; + } + + (*bufs)[0] = header; + + if (message->type == RAFT_IO_APPEND_ENTRIES) { + unsigned i; + for (i = 0; i < message->append_entries.n_entries; i++) { + const struct raft_entry *entry = + &message->append_entries.entries[i]; + (*bufs)[i + 1].base = entry->buf.base; + (*bufs)[i + 1].len = entry->buf.len; + } + } + + if (message->type == RAFT_IO_INSTALL_SNAPSHOT) { + (*bufs)[1].base = message->install_snapshot.data.base; + (*bufs)[1].len = message->install_snapshot.data.len; + } + + return 0; + +oom_after_header_alloc: + raft_free(header.base); + +oom: + return RAFT_NOMEM; +} + +void uvEncodeBatchHeader(const struct raft_entry *entries, + unsigned n, + void *buf) +{ + unsigned i; + void *cursor = buf; + + /* Number of entries in the batch, little endian */ + bytePut64(&cursor, n); + + for (i = 0; i < n; i++) { + const struct raft_entry *entry = &entries[i]; + + /* Term in which the entry was created, little endian. */ + bytePut64(&cursor, entry->term); + + /* Message type (Either RAFT_COMMAND or RAFT_CHANGE) */ + bytePut8(&cursor, (uint8_t)entry->type); + + cursor = (uint8_t *)cursor + 3; /* Unused */ + + /* Size of the log entry data, little endian. */ + bytePut32(&cursor, (uint32_t)entry->buf.len); + } +} + +static void decodeRequestVote(const uv_buf_t *buf, struct raft_request_vote *p) +{ + const void *cursor; + + cursor = buf->base; + + p->version = 1; + p->term = byteGet64(&cursor); + p->candidate_id = byteGet64(&cursor); + p->last_log_index = byteGet64(&cursor); + p->last_log_term = byteGet64(&cursor); + + /* Support for legacy request vote that doesn't have disrupt_leader. 
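The decoders below rely on header length alone to tell protocol versions apart; no explicit version field is sent on the wire. For RequestVote:

```c
/*
 * header length 32 (4 fields) -> v1: no flags word;
 *                                disrupt_leader = pre_vote = false
 * header length 40 (5 fields) -> v2: trailing flags word;
 *                                bit 0 = disrupt_leader, bit 1 = pre_vote
 */
```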
*/ + if (buf->len == sizeofRequestVoteV1()) { + p->disrupt_leader = false; + p->pre_vote = false; + } else { + p->version = 2; + uint64_t flags = byteGet64(&cursor); + p->disrupt_leader = (bool)(flags & 1 << 0); + p->pre_vote = (bool)(flags & 1 << 1); + } +} + +static void decodeRequestVoteResult(const uv_buf_t *buf, + struct raft_request_vote_result *p) +{ + const void *cursor; + + cursor = buf->base; + + p->version = 1; + p->term = byteGet64(&cursor); + p->vote_granted = byteGet64(&cursor); + + if (buf->len > sizeofRequestVoteResultV1()) { + p->version = 2; + uint64_t flags = byteGet64(&cursor); + p->pre_vote = (flags & (1 << 0)); + } +} + +int uvDecodeBatchHeader(const void *batch, + struct raft_entry **entries, + unsigned *n) +{ + const void *cursor = batch; + size_t i; + int rv; + + *n = (unsigned)byteGet64(&cursor); + + if (*n == 0) { + *entries = NULL; + return 0; + } + + *entries = raft_malloc(*n * sizeof **entries); + + if (*entries == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + for (i = 0; i < *n; i++) { + struct raft_entry *entry = &(*entries)[i]; + + entry->term = byteGet64(&cursor); + entry->type = byteGet8(&cursor); + + if (entry->type != RAFT_COMMAND && + entry->type != RAFT_BARRIER && entry->type != RAFT_CHANGE) { + rv = RAFT_MALFORMED; + goto err_after_alloc; + } + + cursor = (uint8_t *)cursor + 3; /* Unused */ + + /* Size of the log entry data, little endian. */ + entry->buf.len = byteGet32(&cursor); + } + + return 0; + +err_after_alloc: + raft_free(*entries); + *entries = NULL; + +err: + assert(rv != 0); + + return rv; +} + +static int decodeAppendEntries(const uv_buf_t *buf, + struct raft_append_entries *args) +{ + const void *cursor; + int rv; + + assert(buf != NULL); + assert(args != NULL); + + cursor = buf->base; + + args->version = 0; + args->term = byteGet64(&cursor); + args->prev_log_index = byteGet64(&cursor); + args->prev_log_term = byteGet64(&cursor); + args->leader_commit = byteGet64(&cursor); + + rv = uvDecodeBatchHeader(cursor, &args->entries, &args->n_entries); + if (rv != 0) { + return rv; + } + + return 0; +} + +static void decodeAppendEntriesResult(const uv_buf_t *buf, + struct raft_append_entries_result *p) +{ + const void *cursor; + + cursor = buf->base; + + p->version = 0; + p->term = byteGet64(&cursor); + p->rejected = byteGet64(&cursor); + p->last_log_index = byteGet64(&cursor); + p->features = 0; + if (buf->len > sizeofAppendEntriesResultV0()) { + p->version = 1; + p->features = byteGet64(&cursor); + } +} + +static int decodeInstallSnapshot(const uv_buf_t *buf, + struct raft_install_snapshot *args) +{ + const void *cursor; + struct raft_buffer conf; + int rv; + + assert(buf != NULL); + assert(args != NULL); + + cursor = buf->base; + + args->version = 0; + args->term = byteGet64(&cursor); + args->last_index = byteGet64(&cursor); + args->last_term = byteGet64(&cursor); + args->conf_index = byteGet64(&cursor); + conf.len = (size_t)byteGet64(&cursor); + conf.base = (void *)cursor; + + rv = configurationDecode(&conf, &args->conf); + if (rv != 0) { + return rv; + } + cursor = (uint8_t *)cursor + conf.len; + args->data.len = (size_t)byteGet64(&cursor); + + return 0; +} + +static void decodeTimeoutNow(const uv_buf_t *buf, struct raft_timeout_now *p) +{ + const void *cursor; + + cursor = buf->base; + + p->version = 0; + p->term = byteGet64(&cursor); + p->last_log_index = byteGet64(&cursor); + p->last_log_term = byteGet64(&cursor); +} + +int uvDecodeMessage(uint16_t type, + const uv_buf_t *header, + struct raft_message *message, + size_t *payload_len) +{ 
+	unsigned i;
+	int rv = 0;
+
+	memset(message, 0, sizeof(*message));
+	message->type = (unsigned short)type;
+
+	*payload_len = 0;
+
+	/* Decode the header. */
+	switch (type) {
+		case RAFT_IO_REQUEST_VOTE:
+			decodeRequestVote(header, &message->request_vote);
+			break;
+		case RAFT_IO_REQUEST_VOTE_RESULT:
+			decodeRequestVoteResult(header,
+						&message->request_vote_result);
+			break;
+		case RAFT_IO_APPEND_ENTRIES:
+			rv = decodeAppendEntries(header,
+						 &message->append_entries);
+			for (i = 0; i < message->append_entries.n_entries;
+			     i++) {
+				*payload_len +=
+				    message->append_entries.entries[i].buf.len;
+			}
+			break;
+		case RAFT_IO_APPEND_ENTRIES_RESULT:
+			decodeAppendEntriesResult(
+			    header, &message->append_entries_result);
+			break;
+		case RAFT_IO_INSTALL_SNAPSHOT:
+			rv = decodeInstallSnapshot(header,
+						   &message->install_snapshot);
+			*payload_len += message->install_snapshot.data.len;
+			break;
+		case RAFT_IO_TIMEOUT_NOW:
+			decodeTimeoutNow(header, &message->timeout_now);
+			break;
+		default:
+			rv = RAFT_IOERR;
+			break;
+	};
+
+	return rv;
+}
+
+void uvDecodeEntriesBatch(uint8_t *batch,
+			  size_t offset,
+			  struct raft_entry *entries,
+			  unsigned n)
+{
+	uint8_t *cursor;
+	size_t i;
+
+	assert(batch != NULL);
+
+	cursor = batch + offset;
+
+	for (i = 0; i < n; i++) {
+		struct raft_entry *entry = &entries[i];
+		entry->batch = batch;
+
+		if (entry->buf.len == 0) {
+			entry->buf.base = NULL;
+			continue;
+		}
+
+		entry->buf.base = cursor;
+
+		cursor = cursor + entry->buf.len;
+		if (entry->buf.len % 8 != 0) {
+			/* Add padding */
+			cursor = cursor + 8 - (entry->buf.len % 8);
+		}
+	}
+}
diff --git a/src/raft/uv_encoding.h b/src/raft/uv_encoding.h
new file mode 100644
index 000000000..e0c2626e1
--- /dev/null
+++ b/src/raft/uv_encoding.h
@@ -0,0 +1,59 @@
+/* Encoding routines for the libuv-based @raft_io backend. */
+
+#ifndef UV_ENCODING_H_
+#define UV_ENCODING_H_
+
+#include <uv.h>
+
+#include "../raft.h"
+
+/* Current disk format version. */
+#define UV__DISK_FORMAT 1
+
+int uvEncodeMessage(const struct raft_message *message,
+		    uv_buf_t **bufs,
+		    unsigned *n_bufs);
+
+int uvDecodeMessage(uint16_t type,
+		    const uv_buf_t *header,
+		    struct raft_message *message,
+		    size_t *payload_len);
+
+int uvDecodeBatchHeader(const void *batch,
+			struct raft_entry **entries,
+			unsigned *n);
+
+void uvDecodeEntriesBatch(uint8_t *batch,
+			  size_t offset,
+			  struct raft_entry *entries,
+			  unsigned n);
+
+/**
+ * The layout of the memory pointed at by a @batch pointer is the following:
+ *
+ * [8 bytes] Number of entries in the batch, little endian.
+ * [header1] Header data of the first entry of the batch.
+ * [ ...   ] More headers
+ * [headerN] Header data of the last entry of the batch.
+ * [data1  ] Payload data of the first entry of the batch.
+ * [ ...   ] More data
+ * [dataN  ] Payload data of the last entry of the batch.
+ *
+ * An entry header is 16 bytes long and has the following layout:
+ *
+ * [8 bytes] Term in which the entry was created, little endian.
+ * [1 byte ] Message type (either RAFT_COMMAND or RAFT_CHANGE)
+ * [3 bytes] Currently unused.
+ * [4 bytes] Size of the log entry data, little endian.
+ *
+ * A payload data section for an entry is simply a sequence of bytes of
+ * arbitrary length, possibly padded with extra bytes to reach an 8-byte
+ * boundary (which means that all entry data pointers are 8-byte aligned).
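A concrete instance of the layout just described, for a batch of two entries carrying 5 and 8 bytes of payload:

```c
/*
 * offset  0: 02 00 00 00 00 00 00 00  number of entries = 2
 * offset  8: term of entry 1 (8 bytes, little endian)
 * offset 16: type (1) + unused (3) + data length = 5 (4 bytes)
 * offset 24: term of entry 2
 * offset 32: type (1) + unused (3) + data length = 8 (4 bytes)
 * offset 40: 5 payload bytes of entry 1 + 3 padding bytes
 * offset 48: 8 payload bytes of entry 2 (already 8-byte aligned)
 * offset 56: end of batch
 */
```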
+ */
+size_t uvSizeofBatchHeader(size_t n);
+
+void uvEncodeBatchHeader(const struct raft_entry *entries,
+			 unsigned n,
+			 void *buf);
+
+#endif /* UV_ENCODING_H_ */
diff --git a/src/raft/uv_finalize.c b/src/raft/uv_finalize.c
new file mode 100644
index 000000000..638b551ad
--- /dev/null
+++ b/src/raft/uv_finalize.c
@@ -0,0 +1,176 @@
+#include "assert.h"
+#include "heap.h"
+#include "queue.h"
+#include "uv.h"
+#include "uv_os.h"
+
+/* Metadata about an open segment that is not used anymore and should be
+ * closed or removed (if it was not written at all). */
+struct uvDyingSegment
+{
+	struct uv *uv;
+	uvCounter counter;      /* Segment counter */
+	size_t used;            /* Number of used bytes */
+	raft_index first_index; /* Index of first entry */
+	raft_index last_index;  /* Index of last entry */
+	int status;             /* Status code of blocking syscalls */
+	queue queue;            /* Link to finalize queue */
+};
+
+/* Run all blocking syscalls involved in closing a used open segment.
+ *
+ * An open segment is closed by truncating its length to the number of bytes
+ * that were actually written into it and then renaming it. */
+static void uvFinalizeWorkCb(uv_work_t *work)
+{
+	struct uvDyingSegment *segment = work->data;
+	struct uv *uv = segment->uv;
+	char filename1[UV__FILENAME_LEN];
+	char filename2[UV__FILENAME_LEN];
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int rv;
+
+	sprintf(filename1, UV__OPEN_TEMPLATE, segment->counter);
+	sprintf(filename2, UV__CLOSED_TEMPLATE, segment->first_index,
+		segment->last_index);
+
+	tracef("finalize %s into %s", filename1, filename2);
+
+	/* If the segment hasn't actually been used (because the writer has
+	 * been closed or aborted before making any write), just remove it. */
+	if (segment->used == 0) {
+		tracef("remove unused segment file: %s", filename1);
+		rv = UvFsRemoveFile(uv->dir, filename1, errmsg);
+		if (rv != 0) {
+			goto err;
+		}
+		goto sync;
+	}
+
+	/* Truncate and rename the segment. */
+	rv = UvFsTruncateAndRenameFile(uv->dir, segment->used, filename1,
+				       filename2, errmsg);
+	if (rv != 0) {
+		goto err;
+	}
+
+sync:
+	rv = UvFsSyncDir(uv->dir, errmsg);
+	if (rv != 0) {
+		goto err;
+	}
+
+	segment->status = 0;
+	return;
+
+err:
+	tracef("truncate segment %s: %s", filename1, errmsg);
+	assert(rv != 0);
+	segment->status = rv;
+}
+
+static int uvFinalizeStart(struct uvDyingSegment *segment);
+static void uvFinalizeAfterWorkCb(uv_work_t *work, int status)
+{
+	struct uvDyingSegment *segment = work->data;
+	struct uv *uv = segment->uv;
+	tracef("uv finalize after work segment %p cb status:%d",
+	       (void *)segment, status);
+	queue *head;
+	int rv;
+
+	assert(status == 0); /* We don't cancel worker requests */
+	uv->finalize_work.data = NULL;
+	if (segment->status != 0) {
+		uv->errored = true;
+	}
+	RaftHeapFree(segment);
+
+	/* If we have no more dismissed segments to close, check if there's a
+	 * barrier to unblock or if we are done closing. */
+	if (QUEUE_IS_EMPTY(&uv->finalize_reqs)) {
+		tracef("unblock barrier or close");
+		if (uv->barrier != NULL && UvBarrierReady(uv)) {
+			UvBarrierMaybeTrigger(uv->barrier);
+		}
+		uvMaybeFireCloseCb(uv);
+		return;
+	}
+
+	/* Grab a new dismissed segment to close. */
+	head = QUEUE_HEAD(&uv->finalize_reqs);
+	segment = QUEUE_DATA(head, struct uvDyingSegment, queue);
+	QUEUE_REMOVE(&segment->queue);
+
+	rv = uvFinalizeStart(segment);
+	if (rv != 0) {
+		RaftHeapFree(segment);
+		uv->errored = true;
+	}
+}
+
+/* Start finalizing an open segment.
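uvFinalizeWorkCb above truncates a used segment to its written size and renames it from its open-segment name to a closed-segment name. An illustration of the transition, assuming the usual template values (the actual UV__OPEN_TEMPLATE and UV__CLOSED_TEMPLATE formats live in uv.h, which is not part of this diff):

```c
/* Assumed formats: "open-%llu" for writable segments, a pair of
 * zero-padded entry indices for closed ones.
 *
 *   open-42  ->  0000000000001205-0000000000001300
 *
 * A writable segment is identified by its creation counter; a closed one
 * by the range of entry indices it contains. Unused segments (used == 0)
 * are simply removed instead of renamed. */
```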
*/ +static int uvFinalizeStart(struct uvDyingSegment *segment) +{ + struct uv *uv = segment->uv; + int rv; + + assert(uv->finalize_work.data == NULL); + assert(segment->counter > 0); + + uv->finalize_work.data = segment; + + rv = uv_queue_work(uv->loop, &uv->finalize_work, uvFinalizeWorkCb, + uvFinalizeAfterWorkCb); + if (rv != 0) { + ErrMsgPrintf(uv->io->errmsg, + "start to truncate segment file %llu: %s", + segment->counter, uv_strerror(rv)); + return RAFT_IOERR; + } + + return 0; +} + +int UvFinalize(struct uv *uv, + unsigned long long counter, + size_t used, + raft_index first_index, + raft_index last_index) +{ + struct uvDyingSegment *segment; + int rv; + + if (used > 0) { + assert(first_index > 0); + assert(last_index >= first_index); + } + + segment = RaftHeapMalloc(sizeof *segment); + if (segment == NULL) { + return RAFT_NOMEM; + } + + segment->uv = uv; + segment->counter = counter; + segment->used = used; + segment->first_index = first_index; + segment->last_index = last_index; + + /* If we're already processing a segment, let's put the request in the + * queue and wait. */ + if (uv->finalize_work.data != NULL) { + QUEUE_PUSH(&uv->finalize_reqs, &segment->queue); + return 0; + } + + rv = uvFinalizeStart(segment); + if (rv != 0) { + RaftHeapFree(segment); + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/uv_fs.c b/src/raft/uv_fs.c new file mode 100644 index 000000000..d28ac9eec --- /dev/null +++ b/src/raft/uv_fs.c @@ -0,0 +1,933 @@ +#include "uv_fs.h" + +#include +#include +#include +#include + +#include "assert.h" +#include "compress.h" +#include "err.h" +#include "heap.h" +#include "uv_os.h" + +int UvFsCheckDir(const char *dir, char *errmsg) +{ + struct uv_fs_s req; + int rv; + + /* Make sure we have a directory we can write into. */ + rv = uv_fs_stat(NULL, &req, dir, NULL); + if (rv != 0) { + switch (rv) { + case UV_ENOENT: + ErrMsgPrintf((char *)errmsg, + "directory '%s' does not exist", + dir); + return RAFT_NOTFOUND; + case UV_EACCES: + ErrMsgPrintf((char *)errmsg, + "can't access directory '%s'", + dir); + return RAFT_UNAUTHORIZED; + case UV_ENOTDIR: + ErrMsgPrintf((char *)errmsg, + "path '%s' is not a directory", + dir); + return RAFT_INVALID; + } + ErrMsgPrintf((char *)errmsg, "can't stat '%s': %s", dir, + uv_strerror(rv)); + return RAFT_IOERR; + } + + if (!(req.statbuf.st_mode & S_IFDIR)) { + ErrMsgPrintf((char *)errmsg, "path '%s' is not a directory", + dir); + return RAFT_INVALID; + } + + if (!(req.statbuf.st_mode & S_IWRITE)) { + ErrMsgPrintf((char *)errmsg, "directory '%s' is not writable", + dir); + return RAFT_INVALID; + } + + return 0; +} + +int UvFsSyncDir(const char *dir, char *errmsg) +{ + uv_file fd; + int rv; + rv = UvOsOpen(dir, UV_FS_O_RDONLY | UV_FS_O_DIRECTORY, 0, &fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "open directory", rv); + return RAFT_IOERR; + } + rv = UvOsFsync(fd); + UvOsClose(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync directory", rv); + return RAFT_IOERR; + } + return 0; +} + +int UvFsFileExists(const char *dir, + const char *filename, + bool *exists, + char *errmsg) +{ + uv_stat_t sb; + char path[UV__PATH_SZ]; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = UvOsStat(path, &sb); + if (rv != 0) { + if (rv == UV_ENOENT) { + *exists = false; + goto out; + } + UvOsErrMsg(errmsg, "stat", rv); + return RAFT_IOERR; + } + + *exists = true; + +out: + return 0; +} + +/* Get the size of the given file. 
*/ +int UvFsFileSize(const char *dir, + const char *filename, + off_t *size, + char *errmsg) +{ + uv_stat_t sb; + char path[UV__PATH_SZ]; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = UvOsStat(path, &sb); + if (rv != 0) { + UvOsErrMsg(errmsg, "stat", rv); + return RAFT_IOERR; + } + *size = (off_t)sb.st_size; + + return 0; +} + +int UvFsFileIsEmpty(const char *dir, + const char *filename, + bool *empty, + char *errmsg) +{ + off_t size; + int rv; + + rv = UvFsFileSize(dir, filename, &size, errmsg); + if (rv != 0) { + return rv; + } + *empty = size == 0 ? true : false; + return 0; +} + +/* Open a file in a directory. */ +static int uvFsOpenFile(const char *dir, + const char *filename, + int flags, + int mode, + uv_file *fd, + char *errmsg) +{ + char path[UV__PATH_SZ]; + int rv; + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsOpen(path, flags, mode, fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "open", rv); + return RAFT_IOERR; + } + return 0; +} + +int UvFsOpenFileForReading(const char *dir, + const char *filename, + uv_file *fd, + char *errmsg) +{ + char path[UV__PATH_SZ]; + int flags = O_RDONLY; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + return uvFsOpenFile(dir, filename, flags, 0, fd, errmsg); +} + +int UvFsAllocateFile(const char *dir, + const char *filename, + size_t size, + uv_file *fd, + bool fallocate, + char *errmsg) +{ + char path[UV__PATH_SZ]; + int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */ + int rv = 0; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + /* Allocate the desired size. */ + if (fallocate) { + /* TODO: use RWF_DSYNC instead, if available. */ + flags |= O_DSYNC; + rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, + errmsg); + if (rv != 0) { + goto err; + } + rv = UvOsFallocate(*fd, 0, (off_t)size); + if (rv == 0) { + return 0; + } else if (rv == UV_ENOSPC) { + ErrMsgPrintf(errmsg, + "not enough space to allocate %zu bytes", + size); + rv = RAFT_NOSPACE; + goto err_after_open; + } else { + UvOsErrMsg(errmsg, "posix_allocate", rv); + rv = RAFT_IOERR; + goto err_after_open; + } + } else { + /* Emulate fallocate, open without O_DSYNC, because we risk + * doing a lot of synced writes. */ + rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, + errmsg); + if (rv != 0) { + goto err; + } + rv = UvOsFallocateEmulation(*fd, 0, (off_t)size); + if (rv == UV_ENOSPC) { + ErrMsgPrintf(errmsg, + "not enough space to allocate %zu bytes", + size); + rv = RAFT_NOSPACE; + goto err_after_open; + } else if (rv != 0) { + ErrMsgPrintf(errmsg, "fallocate emulation %d", rv); + rv = RAFT_IOERR; + goto err_after_open; + } + rv = UvOsFsync(*fd); + if (rv != 0) { + ErrMsgPrintf(errmsg, "fsync %d", rv); + rv = RAFT_IOERR; + goto err_after_open; + } + /* Now close and reopen the file with O_DSYNC */ + rv = UvOsClose(*fd); + if (rv != 0) { + ErrMsgPrintf(errmsg, "close %d", rv); + goto err_unlink; + } + /* TODO: use RWF_DSYNC instead, if available. 
*/ + flags = O_WRONLY | O_DSYNC; + rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, + errmsg); + if (rv != 0) { + goto err_unlink; + } + } + + return 0; + +err_after_open: + UvOsClose(*fd); +err_unlink: + UvOsUnlink(path); +err: + assert(rv != 0); + return rv; +} + +static int uvFsWriteFile(const char *dir, + const char *filename, + int flags, + struct raft_buffer *bufs, + unsigned n_bufs, + char *errmsg) +{ + uv_file fd; + int rv; + size_t size; + unsigned i; + size = 0; + for (i = 0; i < n_bufs; i++) { + size += bufs[i].len; + } + rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, &fd, errmsg); + if (rv != 0) { + goto err; + } + rv = UvOsWrite(fd, (const uv_buf_t *)bufs, n_bufs, 0); + if (rv != (int)(size)) { + if (rv < 0) { + UvOsErrMsg(errmsg, "write", rv); + } else { + ErrMsgPrintf(errmsg, + "short write: %d only bytes written", rv); + } + goto err_after_file_open; + } + rv = UvOsFsync(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync", rv); + goto err_after_file_open; + } + rv = UvOsClose(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "close", rv); + goto err; + } + return 0; + +err_after_file_open: + UvOsClose(fd); +err: + return rv; +} + +int UvFsMakeFile(const char *dir, + const char *filename, + struct raft_buffer *bufs, + unsigned n_bufs, + char *errmsg) +{ + int rv; + char tmp_filename[UV__FILENAME_LEN + 1] = {0}; + char path[UV__PATH_SZ] = {0}; + char tmp_path[UV__PATH_SZ] = {0}; + + /* Create a temp file with the given content + * TODO as of libuv 1.34.0, use `uv_fs_mkstemp` */ + size_t sz = sizeof(tmp_filename); + rv = snprintf(tmp_filename, sz, TMP_FILE_FMT, filename); + if (rv < 0 || rv >= (int)sz) { + return rv; + } + int flags = UV_FS_O_WRONLY | UV_FS_O_CREAT | UV_FS_O_EXCL; + rv = uvFsWriteFile(dir, tmp_filename, flags, bufs, n_bufs, errmsg); + if (rv != 0) { + goto err_after_tmp_create; + } + + /* Check if the file exists */ + bool exists = false; + rv = UvFsFileExists(dir, filename, &exists, errmsg); + if (rv != 0) { + goto err_after_tmp_create; + } + if (exists) { + rv = -1; + goto err_after_tmp_create; + } + + /* Rename the temp file. Remark that there is a race between the + * existence check and the rename, there is no `renameat2` equivalent in + * libuv. 
However, in the current implementation this should pose no + * problems.*/ + rv = UvOsJoin(dir, tmp_filename, tmp_path); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsRename(tmp_path, path); + if (rv != 0) { + UvOsErrMsg(errmsg, "rename", rv); + goto err_after_tmp_create; + } + + rv = UvFsSyncDir(dir, errmsg); + if (rv != 0) { + char ignored[RAFT_ERRMSG_BUF_SIZE]; + UvFsRemoveFile(dir, filename, ignored); + return rv; + } + + return 0; + +err_after_tmp_create: + UvFsRemoveFile(dir, tmp_filename, errmsg); + return rv; +} + +int UvFsMakeOrOverwriteFile(const char *dir, + const char *filename, + const struct raft_buffer *buf, + char *errmsg) +{ + char path[UV__PATH_SZ]; + int flags = UV_FS_O_WRONLY; + int mode = 0; + bool exists = true; + uv_file fd; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + +open: + rv = UvOsOpen(path, flags, mode, &fd); + if (rv != 0) { + if (rv == UV_ENOENT && !(flags & UV_FS_O_CREAT)) { + exists = false; + flags |= UV_FS_O_CREAT; + mode = S_IRUSR | S_IWUSR; + goto open; + } + goto err; + } + + rv = UvOsWrite(fd, (const uv_buf_t *)buf, 1, 0); + if (rv != (int)(buf->len)) { + if (rv < 0) { + UvOsErrMsg(errmsg, "write", rv); + } else { + ErrMsgPrintf(errmsg, + "short write: %d only bytes written", rv); + } + goto err_after_file_open; + } + + if (exists) { + rv = UvOsFdatasync(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync", rv); + goto err_after_file_open; + } + } else { + rv = UvOsFsync(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync", rv); + goto err_after_file_open; + } + } + + rv = UvOsClose(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "close", rv); + goto err; + } + + if (!exists) { + rv = UvFsSyncDir(dir, errmsg); + if (rv != 0) { + goto err; + } + } + + return 0; + +err_after_file_open: + UvOsClose(fd); +err: + return RAFT_IOERR; +} + +int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg) +{ + ssize_t rv; + size_t offset = 0; + + /* TODO: use uv_fs_read() */ + while (offset < buf->len) { + rv = read(fd, (char *)buf->base + offset, buf->len - offset); + if (rv == -1) { + UvOsErrMsg(errmsg, "read", -errno); + return RAFT_IOERR; + } + /* EOF. Don't think this is reachable, but just make very sure + * we don't loop forever. 
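UvFsMakeFile above follows the classic durable-create pattern: write the content to a temporary file, persist it, atomically publish it under the final name, then persist the directory entry. A plain-POSIX sketch of the same sequence (error handling elided; the helper name and paths are hypothetical):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Plain-POSIX sketch of the pattern UvFsMakeFile implements with libuv
 * calls; error handling elided for brevity. */
static int durable_create(const char *dir, const char *tmp_path,
			  const char *final_path, const void *data,
			  size_t len)
{
	int fd = open(tmp_path, O_WRONLY | O_CREAT | O_EXCL, 0600);
	write(fd, data, len);         /* 1. write content to a temp file */
	fsync(fd);                    /* 2. persist the file data */
	close(fd);
	rename(tmp_path, final_path); /* 3. atomically publish the file */
	fd = open(dir, O_RDONLY | O_DIRECTORY);
	fsync(fd);                    /* 4. persist the directory entry */
	close(fd);
	return 0;
}
```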
*/ + if (rv == 0) { + break; + } + assert(rv > 0); + offset += (size_t)rv; + } + if (offset < buf->len) { + ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu", + offset, buf->len); + return RAFT_IOERR; + } + return 0; +} + +int UvFsReadFile(const char *dir, + const char *filename, + struct raft_buffer *buf, + char *errmsg) +{ + uv_stat_t sb; + char path[UV__PATH_SZ]; + uv_file fd; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = UvOsStat(path, &sb); + if (rv != 0) { + UvOsErrMsg(errmsg, "stat", rv); + rv = RAFT_IOERR; + goto err; + } + + rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg); + if (rv != 0) { + goto err; + } + + buf->len = (size_t)sb.st_size; + buf->base = RaftHeapMalloc(buf->len); + if (buf->base == NULL) { + ErrMsgOom(errmsg); + rv = RAFT_NOMEM; + goto err_after_open; + } + + rv = UvFsReadInto(fd, buf, errmsg); + if (rv != 0) { + goto err_after_buf_alloc; + } + + UvOsClose(fd); + + return 0; + +err_after_buf_alloc: + RaftHeapFree(buf->base); +err_after_open: + UvOsClose(fd); +err: + return rv; +} + +int UvFsReadFileInto(const char *dir, + const char *filename, + struct raft_buffer *buf, + char *errmsg) +{ + char path[UV__PATH_SZ]; + uv_file fd; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg); + if (rv != 0) { + goto err; + } + + rv = UvFsReadInto(fd, buf, errmsg); + if (rv != 0) { + goto err_after_open; + } + + UvOsClose(fd); + + return 0; + +err_after_open: + UvOsClose(fd); +err: + return rv; +} + +int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg) +{ + char path[UV__PATH_SZ]; + int rv; + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsUnlink(path); + if (rv != 0) { + UvOsErrMsg(errmsg, "unlink", rv); + return RAFT_IOERR; + } + return 0; +} + +int UvFsRenameFile(const char *dir, + const char *filename1, + const char *filename2, + char *errmsg) +{ + char path1[UV__PATH_SZ]; + char path2[UV__PATH_SZ]; + int rv; + + rv = UvOsJoin(dir, filename1, path1); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsJoin(dir, filename2, path2); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = UvOsRename(path1, path2); + if (rv != 0) { + UvOsErrMsg(errmsg, "rename", rv); + return rv; + } + + return 0; +} + +int UvFsTruncateAndRenameFile(const char *dir, + size_t size, + const char *filename1, + const char *filename2, + char *errmsg) +{ + char path1[UV__PATH_SZ]; + char path2[UV__PATH_SZ]; + uv_file fd; + int rv; + + rv = UvOsJoin(dir, filename1, path1); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsJoin(dir, filename2, path2); + if (rv != 0) { + return RAFT_INVALID; + } + + /* Truncate and rename. */ + rv = UvOsOpen(path1, UV_FS_O_RDWR, 0, &fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "open", rv); + goto err; + } + rv = UvOsTruncate(fd, (off_t)size); + if (rv != 0) { + UvOsErrMsg(errmsg, "truncate", rv); + goto err_after_open; + } + rv = UvOsFsync(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync", rv); + goto err_after_open; + } + UvOsClose(fd); + + rv = UvOsRename(path1, path2); + if (rv != 0) { + UvOsErrMsg(errmsg, "rename", rv); + goto err; + } + + return 0; + +err_after_open: + UvOsClose(fd); +err: + return RAFT_IOERR; +} + +/* Check if direct I/O is possible on the given fd. */ +static int probeDirectIO(int fd, size_t *size, char *errmsg) +{ + struct statfs fs_info; /* To check the file system type. 
*/ + void *buf; /* Buffer to use for the probe write. */ + int rv; + + rv = UvOsSetDirectIo(fd); + if (rv != 0) { + if (rv != UV_EINVAL) { + /* UNTESTED: the parameters are ok, so this should never + * happen. */ + UvOsErrMsg(errmsg, "fnctl", rv); + return RAFT_IOERR; + } + rv = fstatfs(fd, &fs_info); + if (rv == -1) { + /* UNTESTED: in practice ENOMEM should be the only + * failure mode */ + UvOsErrMsg(errmsg, "fstatfs", -errno); + return RAFT_IOERR; + } + switch (fs_info.f_type) { + case 0x01021994: /* TMPFS_MAGIC */ + case 0x2fc12fc1: /* ZFS magic */ + case 0x24051905: /* UBIFS Support magic */ + *size = 0; + return 0; + default: + /* UNTESTED: this is an unsupported file system. + */ +#if defined(__s390x__) + ErrMsgPrintf(errmsg, + "unsupported file system: %ux", + fs_info.f_type); +#else + ErrMsgPrintf(errmsg, + "unsupported file system: %zx", + fs_info.f_type); +#endif + return RAFT_IOERR; + } + } + + /* Try to perform direct I/O, using various buffer size. */ + *size = 4096; + while (*size >= 512) { + buf = raft_aligned_alloc(*size, *size); + if (buf == NULL) { + ErrMsgOom(errmsg); + return RAFT_NOMEM; + } + memset(buf, 0, *size); + rv = (int)write(fd, buf, *size); + raft_aligned_free(*size, buf); + if (rv > 0) { + /* Since we fallocate'ed the file, we should never fail + * because of lack of disk space, and all bytes should + * have been written. */ + assert(rv == (int)(*size)); + return 0; + } + assert(rv == -1); + if (errno != EIO && errno != EOPNOTSUPP) { + /* UNTESTED: this should basically fail only because of + * disk errors, since we allocated the file with + * posix_fallocate. */ + + /* FIXME: this is a workaround because shiftfs doesn't + * return EINVAL in the fnctl call above, for example + * when the underlying fs is ZFS. */ + if (errno == EINVAL && *size == 4096) { + *size = 0; + return 0; + } + + UvOsErrMsg(errmsg, "write", -errno); + return RAFT_IOERR; + } + *size = *size / 2; + } + + *size = 0; + return 0; +} + +/* Check if fully non-blocking async I/O is possible on the given fd. */ +static int probeAsyncIO(int fd, size_t size, bool *ok, char *errmsg) +{ + void *buf; /* Buffer to use for the probe write */ + aio_context_t ctx = 0; /* KAIO context handle */ + struct iocb iocb; /* KAIO request object */ + struct iocb *iocbs = &iocb; /* Because the io_submit() API sucks */ + struct io_event event; /* KAIO response object */ + int n_events; + int rv; + + /* Setup the KAIO context handle */ + rv = UvOsIoSetup(1, &ctx); + if (rv != 0) { + UvOsErrMsg(errmsg, "io_setup", rv); + /* UNTESTED: in practice this should fail only with ENOMEM */ + return RAFT_IOERR; + } + + /* Allocate the write buffer */ + buf = raft_aligned_alloc(size, size); + if (buf == NULL) { + ErrMsgOom(errmsg); + return RAFT_NOMEM; + } + memset(buf, 0, size); + + /* Prepare the KAIO request object */ + memset(&iocb, 0, sizeof iocb); + iocb.aio_lio_opcode = IOCB_CMD_PWRITE; + *((void **)(&iocb.aio_buf)) = buf; + iocb.aio_nbytes = size; + iocb.aio_offset = 0; + iocb.aio_fildes = (uint32_t)fd; + iocb.aio_reqprio = 0; + iocb.aio_rw_flags |= RWF_NOWAIT | RWF_DSYNC; + + /* Submit the KAIO request */ + rv = UvOsIoSubmit(ctx, 1, &iocbs); + if (rv != 0) { + /* UNTESTED: in practice this should fail only with ENOMEM */ + raft_aligned_free(size, buf); + UvOsIoDestroy(ctx); + /* On ZFS 0.8 this is not properly supported yet. Also, when + * running on older kernels a binary compiled on a kernel with + * RWF_NOWAIT support, we might get EINVAL. 
*/ + if (errno == EOPNOTSUPP || errno == EINVAL) { + *ok = false; + return 0; + } + UvOsErrMsg(errmsg, "io_submit", rv); + return RAFT_IOERR; + } + + /* Fetch the response: will block until done. */ + n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL); + assert(n_events == 1); + if (n_events != 1) { + /* UNTESTED */ + UvOsErrMsg(errmsg, "UvOsIoGetevents", n_events); + return RAFT_IOERR; + } + + /* Release the write buffer. */ + raft_aligned_free(size, buf); + + /* Release the KAIO context handle. */ + rv = UvOsIoDestroy(ctx); + if (rv != 0) { + UvOsErrMsg(errmsg, "io_destroy", rv); + return RAFT_IOERR; + } + + if (event.res > 0) { + assert(event.res == (int)size); + *ok = true; + } else { + /* UNTESTED: this should basically fail only because of disk + * errors, since we allocated the file with posix_fallocate and + * the block size is supposed to be correct. */ + *ok = false; + if (event.res == -EAGAIN) { + /* If EAGAIN is encountered we assume the functionality + * is supported but this write would have blocked for + * some reason. UvWriter has a fallback mechanism to + * schedule writes on the thread pool in case the async + * write fails with EAGAIN, so this is safe. */ + *ok = true; + } + } + + return 0; +} + +#define UV__FS_PROBE_FALLOCATE_FILE ".probe_fallocate" +/* Leave detection of other error conditions to other probe* functions, only + * bother checking if posix_fallocate returns success. */ +static void probeFallocate(const char *dir, bool *fallocate) +{ + int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */ + char ignored[RAFT_ERRMSG_BUF_SIZE]; + int rv = 0; + int fd = -1; + + *fallocate = false; + UvFsRemoveFile(dir, UV__FS_PROBE_FALLOCATE_FILE, ignored); + rv = uvFsOpenFile(dir, UV__FS_PROBE_FALLOCATE_FILE, flags, + S_IRUSR | S_IWUSR, &fd, ignored); + if (rv != 0) { + goto out; + } + rv = UvOsFallocate(fd, 0, (off_t)4096); + if (rv == 0) { + *fallocate = true; + } + +out: + UvFsRemoveFile(dir, UV__FS_PROBE_FALLOCATE_FILE, ignored); +} + +#define UV__FS_PROBE_FILE ".probe" +#define UV__FS_PROBE_FILE_SIZE 4096 +int UvFsProbeCapabilities(const char *dir, + size_t *direct, + bool *async, + bool *fallocate, + char *errmsg) +{ + int fd; /* File descriptor of the probe file */ + int rv; + char ignored[RAFT_ERRMSG_BUF_SIZE]; + + probeFallocate(dir, fallocate); + + /* Create a temporary probe file. */ + UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored); + rv = UvFsAllocateFile(dir, UV__FS_PROBE_FILE, UV__FS_PROBE_FILE_SIZE, + &fd, *fallocate, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "create I/O capabilities probe file"); + goto err; + } + UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored); + + /* Check if we can use direct I/O. */ + rv = probeDirectIO(fd, direct, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "probe Direct I/O"); + goto err_after_file_open; + } + + /* If direct I/O is not possible, we can't perform fully asynchronous + * I/O, because io_submit might potentially block. */ + if (*direct == 0) { + *async = false; + goto out; + } + rv = probeAsyncIO(fd, *direct, async, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "probe Async I/O"); + goto err_after_file_open; + } + +out: + close(fd); + return 0; + +err_after_file_open: + close(fd); +err: + return rv; +} diff --git a/src/raft/uv_fs.h b/src/raft/uv_fs.h new file mode 100644 index 000000000..8e8159f9d --- /dev/null +++ b/src/raft/uv_fs.h @@ -0,0 +1,121 @@ +/* File system related utilities. 
*/
+
+#ifndef UV_FS_H_
+#define UV_FS_H_
+
+#include <uv.h>
+
+#include "../raft.h"
+#include "err.h"
+
+#define TMP_FILE_PREFIX "tmp-"
+#define TMP_FILE_FMT TMP_FILE_PREFIX "%s"
+
+/* Check that the given directory can be used. */
+int UvFsCheckDir(const char *dir, char *errmsg);
+
+/* Sync the given directory by calling fsync(). */
+int UvFsSyncDir(const char *dir, char *errmsg);
+
+/* Check whether the given file exists. */
+int UvFsFileExists(const char *dir,
+		   const char *filename,
+		   bool *exists,
+		   char *errmsg);
+
+/* Get the size of the given file. */
+int UvFsFileSize(const char *dir,
+		 const char *filename,
+		 off_t *size,
+		 char *errmsg);
+
+/* Check whether the given file in the given directory is empty. */
+int UvFsFileIsEmpty(const char *dir,
+		    const char *filename,
+		    bool *empty,
+		    char *errmsg);
+
+/* Create the given file in the given directory and allocate the given size to
+ * it, returning its file descriptor. The file must not exist yet. */
+int UvFsAllocateFile(const char *dir,
+		     const char *filename,
+		     size_t size,
+		     uv_file *fd,
+		     bool fallocate,
+		     char *errmsg);
+
+/* Create a file and write the given content into it. */
+int UvFsMakeFile(const char *dir,
+		 const char *filename,
+		 struct raft_buffer *bufs,
+		 unsigned n_bufs,
+		 char *errmsg);
+
+/* Create or overwrite a file.
+ *
+ * If the file does not exist yet, it gets created, the given content written
+ * to it, and then fully persisted to disk by fsync()'ing the file and the
+ * dir.
+ *
+ * If the file already exists, it gets overwritten. The assumption is that the
+ * file size will stay the same and its content will change, so only
+ * fdatasync() will be used. */
+int UvFsMakeOrOverwriteFile(const char *dir,
+			    const char *filename,
+			    const struct raft_buffer *buf,
+			    char *errmsg);
+
+/* Open a file for reading. */
+int UvFsOpenFileForReading(const char *dir,
+			   const char *filename,
+			   uv_file *fd,
+			   char *errmsg);
+
+/* Read exactly buf->len bytes from the given file descriptor into buf->base.
+ * Fail if fewer than buf->len bytes are read. */
+int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg);
+
+/* Read all the content of the given file. */
+int UvFsReadFile(const char *dir,
+		 const char *filename,
+		 struct raft_buffer *buf,
+		 char *errmsg);
+
+/* Read exactly buf->len bytes from the given file into buf->base. Fail if
+ * fewer than buf->len bytes are read. */
+int UvFsReadFileInto(const char *dir,
+		     const char *filename,
+		     struct raft_buffer *buf,
+		     char *errmsg);
+
+/* Synchronously remove a file, calling the unlink() system call. */
+int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg);
+
+/* Synchronously truncate a file to the given size and then rename it. */
+int UvFsTruncateAndRenameFile(const char *dir,
+			      size_t size,
+			      const char *filename1,
+			      const char *filename2,
+			      char *errmsg);
+
+/* Synchronously rename a file. */
+int UvFsRenameFile(const char *dir,
+		   const char *filename1,
+		   const char *filename2,
+		   char *errmsg);
+
+/* Return information about the I/O capabilities of the underlying file
+ * system.
+ *
+ * The @direct parameter will be set to zero if direct I/O is not possible, or
+ * to the block size to use for direct I/O otherwise.
+ *
+ * The @async parameter will be set to true if fully asynchronous I/O is
+ * possible using the KAIO API.
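A hypothetical startup sequence using the capability probe declared just below (the directory path is made up):

```c
static int probe_data_dir(const char *dir)
{
	char errmsg[RAFT_ERRMSG_BUF_SIZE];
	size_t direct;  /* 0, or the block size to use with O_DIRECT */
	bool async;     /* true if KAIO + RWF_NOWAIT writes worked */
	bool fallocate; /* true if posix_fallocate() is supported */
	int rv;

	rv = UvFsProbeCapabilities(dir, &direct, &async, &fallocate, errmsg);
	if (rv != 0) {
		return rv; /* errmsg explains which probe failed */
	}
	/* Note that direct == 0 implies async == false: without direct I/O,
	 * io_submit() could block, so writes fall back to the thread pool. */
	return 0;
}
```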
*/
+int UvFsProbeCapabilities(const char *dir,
+			  size_t *direct,
+			  bool *async,
+			  bool *fallocate,
+			  char *errmsg);
+
+#endif /* UV_FS_H_ */
diff --git a/src/raft/uv_ip.c b/src/raft/uv_ip.c
new file mode 100644
index 000000000..4e4ff9f3f
--- /dev/null
+++ b/src/raft/uv_ip.c
@@ -0,0 +1,86 @@
+#include <netdb.h>
+#include <string.h>
+
+#include
+
+#include "../raft.h"
+
+#include "uv_ip.h"
+
+static const char *strCpyUntil(char *target,
+			       const char *source,
+			       size_t target_size,
+			       char separator)
+{
+	size_t i;
+	for (i = 0; i < target_size; ++i) {
+		if (!source[i] || source[i] == separator) {
+			target[i] = 0;
+			return source + i;
+		} else {
+			target[i] = source[i];
+		}
+	}
+	return NULL;
+}
+
+int uvIpAddrSplit(const char *address,
+		  char *host,
+		  size_t host_size,
+		  char *service,
+		  size_t service_size)
+{
+	char colon = ':';
+	const char *service_ptr = NULL;
+
+	if (host) {
+		service_ptr = strCpyUntil(host, address, host_size, colon);
+		if (!service_ptr) {
+			return RAFT_NAMETOOLONG;
+		}
+	}
+	if (service) {
+		if (!service_ptr) {
+			service_ptr = strchr(address, colon);
+		}
+		if (!service_ptr || *service_ptr == 0 ||
+		    *(++service_ptr) == 0) {
+			service_ptr = "8080";
+		}
+		if (!strCpyUntil(service, service_ptr, service_size, 0)) {
+			return RAFT_NAMETOOLONG;
+		}
+	}
+	return 0;
+}
+
+/* Synchronously resolve a hostname to an IP address. */
+int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result)
+{
+	static struct addrinfo hints = {
+	    .ai_flags = AI_ADDRCONFIG | AI_PASSIVE | AI_NUMERICSERV,
+	    .ai_family = AF_INET,
+	    .ai_socktype = SOCK_STREAM,
+	    .ai_protocol = 0};
+	char hostname[NI_MAXHOST];
+	char service[NI_MAXSERV];
+	int rv;
+
+	rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service,
+			   sizeof(service));
+	if (rv != 0) {
+		return rv;
+	}
+
+	if (hostname[0]) {
+		rv = getaddrinfo(hostname, service, &hints, ai_result);
+	} else {
+		rv = getaddrinfo(NULL, service, &hints, ai_result);
+	}
+
+	if (rv != 0) {
+		return RAFT_IOERR;
+	}
+
+	return 0;
+}
diff --git a/src/raft/uv_ip.h b/src/raft/uv_ip.h
new file mode 100644
index 000000000..8cda2b91c
--- /dev/null
+++ b/src/raft/uv_ip.h
@@ -0,0 +1,20 @@
+/* IP-related utils. */
+
+#ifndef UV_IP_H_
+#define UV_IP_H_
+
+#include <stddef.h>
+
+/* Split @address into @host and @service. */
+int uvIpAddrSplit(const char *address,
+		  char *host,
+		  size_t host_size,
+		  char *service,
+		  size_t service_size);
+
+struct addrinfo;
+
+/* Synchronously resolve a hostname to an IP address. */
+int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result);
+
+#endif /* UV_IP_H_ */
diff --git a/src/raft/uv_list.c b/src/raft/uv_list.c
new file mode 100644
index 000000000..18639db36
--- /dev/null
+++ b/src/raft/uv_list.c
@@ -0,0 +1,116 @@
+#include <string.h>
+
+#include "assert.h"
+#include "uv.h"
+
+static const char *uvListIgnored[] = {".", "..", "metadata1", "metadata2",
+				      NULL};
+
+/* Return true if the given filename should be ignored.
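uvIpAddrSplit above defaults the service to "8080" when the address carries no port. A short usage example (NI_MAXHOST and NI_MAXSERV come from <netdb.h>):

```c
#include <netdb.h> /* NI_MAXHOST, NI_MAXSERV */

static void split_examples(void)
{
	char host[NI_MAXHOST];
	char service[NI_MAXSERV];

	/* "127.0.0.1:9001" -> host "127.0.0.1", service "9001" */
	uvIpAddrSplit("127.0.0.1:9001", host, sizeof host, service,
		      sizeof service);

	/* No port in the address: service defaults to "8080" */
	uvIpAddrSplit("127.0.0.1", host, sizeof host, service,
		      sizeof service);
}
```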
*/ +static bool uvListShouldIgnore(const char *filename) +{ + const char **cursor = uvListIgnored; + bool result = false; + if (strlen(filename) >= UV__FILENAME_LEN) { + return true; + } + while (*cursor != NULL) { + if (strcmp(filename, *cursor) == 0) { + result = true; + break; + } + cursor++; + } + return result; +} + +int UvList(struct uv *uv, + struct uvSnapshotInfo *snapshots[], + size_t *n_snapshots, + struct uvSegmentInfo *segments[], + size_t *n_segments, + char *errmsg) +{ + struct uv_fs_s req; + struct uv_dirent_s entry; + int n; + int i; + int rv; + int rv2; + + n = uv_fs_scandir(NULL, &req, uv->dir, 0, NULL); + if (n < 0) { + ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n)); + return RAFT_IOERR; + } + + *snapshots = NULL; + *n_snapshots = 0; + + *segments = NULL; + *n_segments = 0; + + rv = 0; + + for (i = 0; i < n; i++) { + const char *filename; + bool appended; + + rv = uv_fs_scandir_next(&req, &entry); + assert(rv == 0); /* Can't fail in libuv */ + + filename = entry.name; + + /* If an error occurred while processing a preceeding entry or + * if we know that this is not a segment filename, just free it + * and skip to the next one. */ + if (rv != 0 || uvListShouldIgnore(filename)) { + if (rv == 0) { + tracef("ignore %s", filename); + } + continue; + } + + /* Append to the snapshot list if it's a snapshot metadata + * filename and a valid associated snapshot file exists. */ + rv = UvSnapshotInfoAppendIfMatch(uv, filename, snapshots, + n_snapshots, &appended); + if (appended || rv != 0) { + if (rv == 0) { + tracef("snapshot %s", filename); + } + continue; + } + + /* Append to the segment list if it's a segment filename */ + rv = uvSegmentInfoAppendIfMatch(entry.name, segments, + n_segments, &appended); + if (appended || rv != 0) { + if (rv == 0) { + tracef("segment %s", filename); + } + continue; + } + + tracef("ignore %s", filename); + } + + rv2 = uv_fs_scandir_next(&req, &entry); + assert(rv2 == UV_EOF); + + if (rv != 0 && *segments != NULL) { + raft_free(*segments); + } + + if (*snapshots != NULL) { + UvSnapshotSort(*snapshots, *n_snapshots); + } + + if (*segments != NULL) { + uvSegmentSort(*segments, *n_segments); + } + + return rv; +} + +#undef tracef diff --git a/src/raft/uv_metadata.c b/src/raft/uv_metadata.c new file mode 100644 index 000000000..aee87323f --- /dev/null +++ b/src/raft/uv_metadata.c @@ -0,0 +1,204 @@ +#include "assert.h" +#include "byte.h" +#include "uv.h" +#include "uv_encoding.h" + +/* We have metadata1 and metadata2. */ +#define METADATA_FILENAME_PREFIX "metadata" +#define METADATA_FILENAME_SIZE (sizeof(METADATA_FILENAME_PREFIX) + 2) + +/* Format, version, term, vote */ +#define METADATA_CONTENT_SIZE (8 * 4) + +/* Encode the content of a metadata file. */ +static void uvMetadataEncode(const struct uvMetadata *metadata, void *buf) +{ + void *cursor = buf; + bytePut64(&cursor, UV__DISK_FORMAT); + bytePut64(&cursor, metadata->version); + bytePut64(&cursor, metadata->term); + bytePut64(&cursor, metadata->voted_for); +} + +/* Decode the content of a metadata file. 
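+ *
+ * The buffer is expected to follow the fixed 32-byte layout produced by
+ * uvMetadataEncode() above (four 64-bit words, METADATA_CONTENT_SIZE bytes
+ * in total):
+ *
+ *   offset  0: format version (must be UV__DISK_FORMAT)
+ *   offset  8: metadata version
+ *   offset 16: current term
+ *   offset 24: voted for (server ID, 0 if none)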
+ */
+static int uvMetadataDecode(const void *buf,
+                            struct uvMetadata *metadata,
+                            char *errmsg)
+{
+        const void *cursor = buf;
+        uint64_t format;
+        format = byteGet64(&cursor);
+        if (format != UV__DISK_FORMAT) {
+                ErrMsgPrintf(errmsg, "bad format version %ju", format);
+                return RAFT_MALFORMED;
+        }
+        metadata->version = byteGet64(&cursor);
+        metadata->term = byteGet64(&cursor);
+        metadata->voted_for = byteGet64(&cursor);
+
+        /* Coherence check: make sure the decoded values make sense. */
+        if (metadata->version == 0) {
+                ErrMsgPrintf(errmsg, "version is set to zero");
+                return RAFT_CORRUPT;
+        }
+
+        return 0;
+}
+
+/* Render the filename of the metadata file with index @n. */
+static void uvMetadataFilename(const unsigned short n, char *filename)
+{
+        sprintf(filename, METADATA_FILENAME_PREFIX "%d", n);
+}
+
+/* Read the n'th metadata file (with n equal to 1 or 2) and decode the content
+ * of the file, populating the given metadata buffer accordingly. */
+static int uvMetadataLoadN(const char *dir,
+                           const unsigned short n,
+                           struct uvMetadata *metadata,
+                           char *errmsg)
+{
+        char filename[METADATA_FILENAME_SIZE];  /* Filename of the metadata
+                                                   file */
+        uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */
+        off_t size;
+        struct raft_buffer buf;
+        bool exists;
+        int rv;
+
+        assert(n == 1 || n == 2);
+
+        /* Render the metadata path */
+        uvMetadataFilename(n, filename);
+
+        rv = UvFsFileExists(dir, filename, &exists, errmsg);
+        if (rv != 0) {
+                ErrMsgWrapf(errmsg, "check if %s exists", filename);
+                return rv;
+        }
+
+        memset(metadata, 0, sizeof *metadata);
+
+        /* If the file does not exist, just return. */
+        if (!exists) {
+                return 0;
+        }
+
+        /* If the file exists but has fewer bytes than expected, assume that
+         * the server crashed while writing this metadata file, and pretend it
+         * has not been written at all. If it has more bytes than expected,
+         * return an error. */
+        rv = UvFsFileSize(dir, filename, &size, errmsg);
+        if (rv != 0) {
+                ErrMsgWrapf(errmsg, "check size of %s", filename);
+                return rv;
+        }
+
+        if (size != sizeof content) {
+                if ((size_t)size < sizeof content) {
+                        rv = UvFsRemoveFile(dir, filename, errmsg);
+                        if (rv != 0) {
+                                return rv;
+                        }
+                        return 0;
+                }
+                ErrMsgPrintf(errmsg, "%s has size %jd instead of %zu", filename,
+                             (intmax_t)size, sizeof content);
+                return RAFT_CORRUPT;
+        }
+
+        /* Read the content of the metadata file. */
+        buf.base = content;
+        buf.len = sizeof content;
+
+        rv = UvFsReadFileInto(dir, filename, &buf, errmsg);
+        if (rv != 0) {
+                ErrMsgWrapf(errmsg, "read content of %s", filename);
+                return rv;
+        }
+
+        /* Decode the content of the metadata file. */
+        rv = uvMetadataDecode(content, metadata, errmsg);
+        if (rv != 0) {
+                ErrMsgWrapf(errmsg, "decode content of %s", filename);
+                return rv;
+        }
+
+        return 0;
+}
+
+int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg)
+{
+        struct uvMetadata metadata1;
+        struct uvMetadata metadata2;
+        int rv;
+
+        /* Read the two metadata files (if available). */
+        rv = uvMetadataLoadN(dir, 1, &metadata1, errmsg);
+        if (rv != 0) {
+                return rv;
+        }
+        rv = uvMetadataLoadN(dir, 2, &metadata2, errmsg);
+        if (rv != 0) {
+                return rv;
+        }
+
+        /* Check the versions. */
+        if (metadata1.version == 0 && metadata2.version == 0) {
+                /* Neither metadata file exists: we have a brand new server. */
+                metadata->version = 0;
+                metadata->term = 0;
+                metadata->voted_for = 0;
+        } else if (metadata1.version == metadata2.version) {
+                /* The two metadata files can't have the same version.
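+                 * Stores alternate between the two files: odd metadata
+                 * versions go to metadata1 and even ones to metadata2 (see
+                 * uvMetadataFileIndex() below), so a crash in the middle of a
+                 * write can damage at most the file holding the newer
+                 * version, while the other file still holds a complete older
+                 * copy. For example:
+                 *
+                 *   store v1 -> metadata1; store v2 -> metadata2;
+                 *   store v3 -> metadata1 (a crash here leaves v2 intact)
+                 *
+                 * Equal versions can therefore only mean on-disk corruption.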
*/ + ErrMsgPrintf(errmsg, + "metadata1 and metadata2 are both at version %llu", + metadata1.version); + return RAFT_CORRUPT; + } else { + /* Pick the metadata with the grater version. */ + if (metadata1.version > metadata2.version) { + *metadata = metadata1; + } else { + *metadata = metadata2; + } + } + + return 0; +} + +/* Return the metadata file index associated with the given version. */ +static unsigned short uvMetadataFileIndex(unsigned long long version) +{ + return version % 2 == 1 ? 1 : 2; +} + +int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata) +{ + char filename[METADATA_FILENAME_SIZE]; /* Filename of the metadata file + */ + uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */ + struct raft_buffer buf; + unsigned short n; + int rv; + + assert(metadata->version > 0); + + /* Encode the given metadata. */ + uvMetadataEncode(metadata, content); + + /* Render the metadata file name. */ + n = uvMetadataFileIndex(metadata->version); + uvMetadataFilename(n, filename); + + /* Write the metadata file, creating it if it does not exist. */ + buf.base = content; + buf.len = sizeof content; + rv = UvFsMakeOrOverwriteFile(uv->dir, filename, &buf, uv->io->errmsg); + if (rv != 0) { + ErrMsgWrapf(uv->io->errmsg, "persist %s", filename); + return rv; + } + + return 0; +} diff --git a/src/raft/uv_os.c b/src/raft/uv_os.c new file mode 100644 index 000000000..8a96ab130 --- /dev/null +++ b/src/raft/uv_os.c @@ -0,0 +1,222 @@ +#include "uv_os.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "assert.h" +#include "err.h" +#include "syscall.h" + +/* Default permissions when creating a directory. */ +#define DEFAULT_DIR_PERM 0700 + +int UvOsOpen(const char *path, int flags, int mode, uv_file *fd) +{ + struct uv_fs_s req; + int rv; + rv = uv_fs_open(NULL, &req, path, flags, mode, NULL); + if (rv < 0) { + return rv; + } + *fd = rv; + return 0; +} + +int UvOsClose(uv_file fd) +{ + struct uv_fs_s req; + return uv_fs_close(NULL, &req, fd, NULL); +} + +/* Emulate fallocate(). Mostly taken from glibc's implementation. */ +int UvOsFallocateEmulation(int fd, off_t offset, off_t len) +{ + ssize_t increment; + struct statfs f; + int rv; + + rv = fstatfs(fd, &f); + if (rv != 0) { + return -errno; + } + + if (f.f_bsize == 0) { + increment = 512; + } else if (f.f_bsize < 4096) { + increment = f.f_bsize; + } else { + increment = 4096; + } + + for (offset += (len - 1) % increment; len > 0; offset += increment) { + len -= increment; + rv = (int)pwrite(fd, "", 1, offset); + if (rv != 1) { + return -errno; + } + } + + return 0; +} + +int UvOsFallocate(uv_file fd, off_t offset, off_t len) +{ + int rv; + rv = posix_fallocate(fd, offset, len); + if (rv != 0) { + /* From the manual page: + * + * posix_fallocate() returns zero on success, or an error + * number on failure. Note that errno is not set. 
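+                 *
+                 * A caller that must keep working on file systems without
+                 * fallocate() support can fall back to the emulation above
+                 * when posix_fallocate() reports EOPNOTSUPP; a sketch:
+                 *
+                 *   rv = UvOsFallocate(fd, offset, size);
+                 *   if (rv == -EOPNOTSUPP) {
+                 *       rv = UvOsFallocateEmulation(fd, offset, size);
+                 *   }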
+ */ + return -rv; + } + return 0; +} + +int UvOsTruncate(uv_file fd, off_t offset) +{ + struct uv_fs_s req; + return uv_fs_ftruncate(NULL, &req, fd, offset, NULL); +} + +int UvOsFsync(uv_file fd) +{ + struct uv_fs_s req; + return uv_fs_fsync(NULL, &req, fd, NULL); +} + +int UvOsFdatasync(uv_file fd) +{ + struct uv_fs_s req; + return uv_fs_fdatasync(NULL, &req, fd, NULL); +} + +int UvOsStat(const char *path, uv_stat_t *sb) +{ + struct uv_fs_s req; + int rv; + rv = uv_fs_stat(NULL, &req, path, NULL); + if (rv != 0) { + return rv; + } + memcpy(sb, &req.statbuf, sizeof *sb); + return 0; +} + +int UvOsWrite(uv_file fd, + const uv_buf_t bufs[], + unsigned int nbufs, + int64_t offset) +{ + struct uv_fs_s req; + return uv_fs_write(NULL, &req, fd, bufs, nbufs, offset, NULL); +} + +int UvOsUnlink(const char *path) +{ + struct uv_fs_s req; + return uv_fs_unlink(NULL, &req, path, NULL); +} + +int UvOsRename(const char *path1, const char *path2) +{ + struct uv_fs_s req; + return uv_fs_rename(NULL, &req, path1, path2, NULL); +} + +int UvOsJoin(const char *dir, const char *filename, char *path) +{ + if (!UV__DIR_HAS_VALID_LEN(dir) || + !UV__FILENAME_HAS_VALID_LEN(filename)) { + return -1; + } + strcpy(path, dir); + strcat(path, "/"); + strcat(path, filename); + return 0; +} + +int UvOsIoSetup(unsigned nr, aio_context_t *ctxp) +{ + int rv; + rv = io_setup(nr, ctxp); + if (rv == -1) { + return -errno; + } + return 0; +} + +int UvOsIoDestroy(aio_context_t ctx) +{ + int rv; + rv = io_destroy(ctx); + if (rv == -1) { + return -errno; + } + return 0; +} + +int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp) +{ + int rv; + rv = io_submit(ctx, nr, iocbpp); + if (rv == -1) { + return -errno; + } + assert(rv == nr); /* TODO: can something else be returned? */ + return 0; +} + +int UvOsIoGetevents(aio_context_t ctx, + long min_nr, + long max_nr, + struct io_event *events, + struct timespec *timeout) +{ + int rv; + do { + rv = io_getevents(ctx, min_nr, max_nr, events, timeout); + } while (rv == -1 && errno == EINTR); + + if (rv == -1) { + return -errno; + } + assert(rv >= min_nr); + assert(rv <= max_nr); + return rv; +} + +int UvOsEventfd(unsigned int initval, int flags) +{ + int rv; + /* At the moment only UV_FS_O_NONBLOCK is supported */ + assert(flags == UV_FS_O_NONBLOCK); + flags = EFD_NONBLOCK | EFD_CLOEXEC; + rv = eventfd(initval, flags); + if (rv == -1) { + return -errno; + } + return rv; +} + +int UvOsSetDirectIo(uv_file fd) +{ + int flags; /* Current fcntl flags */ + int rv; + flags = fcntl(fd, F_GETFL); + rv = fcntl(fd, F_SETFL, flags | UV_FS_O_DIRECT); + if (rv == -1) { + return -errno; + } + return 0; +} diff --git a/src/raft/uv_os.h b/src/raft/uv_os.h new file mode 100644 index 000000000..741dd8887 --- /dev/null +++ b/src/raft/uv_os.h @@ -0,0 +1,95 @@ +/* Operating system related utilities. */ + +#ifndef UV_OS_H_ +#define UV_OS_H_ + +#include +#include +#include +#include +#include + +/* Maximum size of a full file system path string. */ +#define UV__PATH_SZ 1024 + +/* Maximum length of a filename string. */ +#define UV__FILENAME_LEN 128 + +/* Length of path separator. */ +#define UV__SEP_LEN 1 /* strlen("/") */ + +/* True if STR's length is at most LEN. */ +#define LEN_AT_MOST_(STR, LEN) (strnlen(STR, LEN + 1) <= LEN) + +/* Maximum length of a directory path string. */ +#define UV__DIR_LEN (UV__PATH_SZ - UV__SEP_LEN - UV__FILENAME_LEN - 1) + +/* True if the given DIR string has at most UV__DIR_LEN chars. 
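+ *
+ * The arithmetic: UV__DIR_LEN (894) + UV__SEP_LEN (1) + UV__FILENAME_LEN
+ * (128) + 1 byte for the terminating NUL add up to UV__PATH_SZ (1024),
+ * which is exactly the invariant that UvOsJoin() relies on when it
+ * concatenates dir + "/" + filename into a UV__PATH_SZ-sized buffer.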
*/ +#define UV__DIR_HAS_VALID_LEN(DIR) LEN_AT_MOST_(DIR, UV__DIR_LEN) + +/* True if the given FILENAME string has at most UV__FILENAME_LEN chars. */ +#define UV__FILENAME_HAS_VALID_LEN(FILENAME) \ + LEN_AT_MOST_(FILENAME, UV__FILENAME_LEN) + +/* Portable open() */ +int UvOsOpen(const char *path, int flags, int mode, uv_file *fd); + +/* Portable close() */ +int UvOsClose(uv_file fd); + +/* TODO: figure a portable abstraction. */ +int UvOsFallocate(uv_file fd, off_t offset, off_t len); + +/* Emulation to use in case UvOsFallocate fails with -EONOTSUPP. + * This might happen with a libc implementation (e.g. musl) that + * doesn't implement a transparent fallback if fallocate() is + * not supported by the underlying file system. */ +int UvOsFallocateEmulation(int fd, off_t offset, off_t len); + +/* Portable truncate() */ +int UvOsTruncate(uv_file fd, off_t offset); + +/* Portable fsync() */ +int UvOsFsync(uv_file fd); + +/* Portable fdatasync() */ +int UvOsFdatasync(uv_file fd); + +/* Portable stat() */ +int UvOsStat(const char *path, uv_stat_t *sb); + +/* Portable write() */ +int UvOsWrite(uv_file fd, + const uv_buf_t bufs[], + unsigned int nbufs, + int64_t offset); + +/* Portable unlink() */ +int UvOsUnlink(const char *path); + +/* Portable rename() */ +int UvOsRename(const char *path1, const char *path2); + +/* Join dir and filename into a full OS path. */ +int UvOsJoin(const char *dir, const char *filename, char *path); + +/* TODO: figure a portable abstraction. */ +int UvOsIoSetup(unsigned nr, aio_context_t *ctxp); +int UvOsIoDestroy(aio_context_t ctx); +int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp); +int UvOsIoGetevents(aio_context_t ctx, + long min_nr, + long max_nr, + struct io_event *events, + struct timespec *timeout); +int UvOsEventfd(unsigned int initval, int flags); +int UvOsSetDirectIo(uv_file fd); + +/* Format an error message caused by a failed system call or stdlib function. */ +#define UvOsErrMsg(ERRMSG, SYSCALL, ERRNUM) \ + { \ + ErrMsgPrintf(ERRMSG, "%s", uv_strerror(ERRNUM)); \ + ErrMsgWrapf(ERRMSG, SYSCALL); \ + } + +#endif /* UV_OS_H_ */ diff --git a/src/raft/uv_prepare.c b/src/raft/uv_prepare.c new file mode 100644 index 000000000..00355480f --- /dev/null +++ b/src/raft/uv_prepare.c @@ -0,0 +1,339 @@ +#include +#include + +#include "assert.h" +#include "heap.h" +#include "uv.h" +#include "uv_os.h" + +/* The happy path for UvPrepare is: + * + * - If there is an unused open segment available, return its fd and counter + * immediately. + * + * - Otherwise, wait for the creation of a new open segment to complete, + * possibly kicking off the creation logic if no segment is being created + * currently. + * + * Possible failure modes are: + * + * - The create file request fails, in that case we fail all pending prepare + * requests and we mark the uv instance as errored. + * + * On close: + * + * - Cancel all pending prepare requests. + * - Remove unused prepared open segments. + * - Wait for any pending internal segment creation and then discard the newly + * created segment. + */ + +/* Number of open segments that we try to keep ready for writing. 
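+ *
+ * With a target of two, the writer can grab a prepared segment immediately
+ * while the threadpool allocates the next one in the background. A
+ * consumer-side sketch (prepareCb is a hypothetical callback):
+ *
+ *   struct uvPrepare req;
+ *   uv_file fd;
+ *   uvCounter counter;
+ *   rv = UvPrepare(uv, &fd, &counter, &req, prepareCb);
+ *
+ * On success, fd is valid right away if the pool was non-empty; otherwise
+ * prepareCb fires once a new segment has been created.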
*/ +#define UV__TARGET_POOL_SIZE 2 + +/* An open segment being prepared or sitting in the pool */ +struct uvIdleSegment +{ + struct uv *uv; /* Open segment file */ + size_t size; /* Segment size */ + struct uv_work_s work; /* To execute logic in the threadpool */ + int status; /* Result of threadpool callback */ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; /* Error of threadpool callback */ + unsigned long long counter; /* Segment counter */ + char filename[UV__FILENAME_LEN]; /* Filename of the segment */ + uv_file fd; /* File descriptor of prepared file */ + queue queue; /* Pool */ +}; + +static void uvPrepareWorkCb(uv_work_t *work) +{ + struct uvIdleSegment *segment = work->data; + struct uv *uv = segment->uv; + int rv; + + rv = UvFsAllocateFile(uv->dir, segment->filename, segment->size, + &segment->fd, uv->fallocate, segment->errmsg); + if (rv != 0) { + goto err; + } + + rv = UvFsSyncDir(uv->dir, segment->errmsg); + if (rv != 0) { + goto err_after_allocate; + } + + segment->status = 0; + return; + +err_after_allocate: + UvOsClose(segment->fd); +err: + assert(rv != 0); + segment->status = rv; + return; +} + +/* Flush all pending requests, invoking their callbacks with the given + * status. */ +static void uvPrepareFinishAllRequests(struct uv *uv, int status) +{ + while (!QUEUE_IS_EMPTY(&uv->prepare_reqs)) { + queue *head; + struct uvPrepare *req; + head = QUEUE_HEAD(&uv->prepare_reqs); + req = QUEUE_DATA(head, struct uvPrepare, queue); + QUEUE_REMOVE(&req->queue); + req->cb(req, status); + } +} + +/* Pop the oldest prepared segment in the pool and return its fd and counter + * through the given pointers. */ +static void uvPrepareConsume(struct uv *uv, uv_file *fd, uvCounter *counter) +{ + queue *head; + struct uvIdleSegment *segment; + /* Pop a segment from the pool. */ + head = QUEUE_HEAD(&uv->prepare_pool); + segment = QUEUE_DATA(head, struct uvIdleSegment, queue); + assert(segment->fd >= 0); + QUEUE_REMOVE(&segment->queue); + *fd = segment->fd; + *counter = segment->counter; + RaftHeapFree(segment); +} + +/* Finish the oldest pending prepare request using the next available prepared + * segment. */ +static void uvPrepareFinishOldestRequest(struct uv *uv) +{ + queue *head; + struct uvPrepare *req; + + assert(!uv->closing); + assert(!QUEUE_IS_EMPTY(&uv->prepare_reqs)); + assert(!QUEUE_IS_EMPTY(&uv->prepare_pool)); + + /* Pop the head of the prepare requests queue. */ + head = QUEUE_HEAD(&uv->prepare_reqs); + req = QUEUE_DATA(head, struct uvPrepare, queue); + QUEUE_REMOVE(&req->queue); + + /* Finish the request */ + uvPrepareConsume(uv, &req->fd, &req->counter); + req->cb(req, 0); +} + +/* Return the number of ready prepared open segments in the pool. */ +static unsigned uvPrepareCount(struct uv *uv) +{ + queue *head; + unsigned n; + n = 0; + QUEUE_FOREACH(head, &uv->prepare_pool) + { + n++; + } + return n; +} + +static void uvPrepareAfterWorkCb(uv_work_t *work, int status); + +/* Start creating a new segment file. 
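+ *
+ * The file is named "open-<counter>" (see UV__OPEN_TEMPLATE), is sized to a
+ * whole number of blocks (uv->block_size * uvSegmentBlocks(uv)), and is
+ * allocated in the threadpool so that the event loop never blocks on disk
+ * I/O.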
*/ +static int uvPrepareStart(struct uv *uv) +{ + struct uvIdleSegment *segment; + int rv; + + assert(uv->prepare_inflight == NULL); + assert(uvPrepareCount(uv) < UV__TARGET_POOL_SIZE); + + segment = RaftHeapMalloc(sizeof *segment); + if (segment == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + memset(segment, 0, sizeof *segment); + segment->uv = uv; + segment->counter = uv->prepare_next_counter; + segment->work.data = segment; + segment->fd = -1; + segment->size = uv->block_size * uvSegmentBlocks(uv); + sprintf(segment->filename, UV__OPEN_TEMPLATE, segment->counter); + + tracef("create open segment %s", segment->filename); + rv = uv_queue_work(uv->loop, &segment->work, uvPrepareWorkCb, + uvPrepareAfterWorkCb); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this can't + * fail. */ + tracef("can't create segment %s: %s", segment->filename, + uv_strerror(rv)); + rv = RAFT_IOERR; + goto err_after_segment_alloc; + } + + uv->prepare_inflight = segment; + uv->prepare_next_counter++; + + return 0; + +err_after_segment_alloc: + RaftHeapFree(segment); +err: + assert(rv != 0); + return rv; +} + +static void uvPrepareAfterWorkCb(uv_work_t *work, int status) +{ + struct uvIdleSegment *segment = work->data; + struct uv *uv = segment->uv; + int rv; + assert(status == 0); + + uv->prepare_inflight = + NULL; /* Reset the creation in-progress marker. */ + + /* If we are closing, let's discard the segment. All pending requests + * have already being fired with RAFT_CANCELED. */ + if (uv->closing) { + assert(QUEUE_IS_EMPTY(&uv->prepare_pool)); + assert(QUEUE_IS_EMPTY(&uv->prepare_reqs)); + if (segment->status == 0) { + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + UvOsClose(segment->fd); + UvFsRemoveFile(uv->dir, segment->filename, errmsg); + } + tracef("canceled creation of %s", segment->filename); + RaftHeapFree(segment); + uvMaybeFireCloseCb(uv); + return; + } + + /* If the request has failed, mark all pending requests as failed and + * don't try to create any further segment. + * + * Note that if there's no pending request, we don't set the error + * message, to avoid overwriting previous errors. */ + if (segment->status != 0) { + if (!QUEUE_IS_EMPTY(&uv->prepare_reqs)) { + ErrMsgTransferf(segment->errmsg, uv->io->errmsg, + "create segment %s", segment->filename); + uvPrepareFinishAllRequests(uv, segment->status); + } + uv->errored = true; + RaftHeapFree(segment); + return; + } + + assert(segment->fd >= 0); + + tracef("completed creation of %s", segment->filename); + QUEUE_PUSH(&uv->prepare_pool, &segment->queue); + + /* Let's process any pending request. */ + if (!QUEUE_IS_EMPTY(&uv->prepare_reqs)) { + uvPrepareFinishOldestRequest(uv); + } + + /* If we are already creating a segment, we're done. */ + if (uv->prepare_inflight != NULL) { + return; + } + + /* If we have already enough prepared open segments, we're done. There + * can't be any outstanding prepare requests, since if the request queue + * was not empty, we would have called uvPrepareFinishOldestRequest() + * above, thus reducing the pool size and making it smaller than the + * target size. */ + if (uvPrepareCount(uv) >= UV__TARGET_POOL_SIZE) { + assert(QUEUE_IS_EMPTY(&uv->prepare_reqs)); + return; + } + + /* Let's start preparing a new open segment. */ + rv = uvPrepareStart(uv); + if (rv != 0) { + uvPrepareFinishAllRequests(uv, rv); + uv->errored = true; + } +} + +/* Discard a prepared open segment, closing its file descriptor and removing the + * underlying file. 
*/ +static void uvPrepareDiscard(struct uv *uv, uv_file fd, uvCounter counter) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + char filename[UV__FILENAME_LEN]; + assert(counter > 0); + assert(fd >= 0); + sprintf(filename, UV__OPEN_TEMPLATE, counter); + UvOsClose(fd); + UvFsRemoveFile(uv->dir, filename, errmsg); +} + +int UvPrepare(struct uv *uv, + uv_file *fd, + uvCounter *counter, + struct uvPrepare *req, + uvPrepareCb cb) +{ + int rv; + + assert(!uv->closing); + + if (!QUEUE_IS_EMPTY(&uv->prepare_pool)) { + uvPrepareConsume(uv, fd, counter); + goto maybe_start; + } + + *fd = -1; + *counter = 0; + req->cb = cb; + QUEUE_PUSH(&uv->prepare_reqs, &req->queue); + +maybe_start: + /* If we are already creating a segment, let's just wait. */ + if (uv->prepare_inflight != NULL) { + return 0; + } + + rv = uvPrepareStart(uv); + if (rv != 0) { + goto err; + } + + return 0; + +err: + if (*fd != -1) { + uvPrepareDiscard(uv, *fd, *counter); + } else { + QUEUE_REMOVE(&req->queue); + } + assert(rv != 0); + return rv; +} + +void UvPrepareClose(struct uv *uv) +{ + assert(uv->closing); + + /* Cancel all pending prepare requests. */ + uvPrepareFinishAllRequests(uv, RAFT_CANCELED); + + /* Remove any unused prepared segment. */ + while (!QUEUE_IS_EMPTY(&uv->prepare_pool)) { + queue *head; + struct uvIdleSegment *segment; + head = QUEUE_HEAD(&uv->prepare_pool); + segment = QUEUE_DATA(head, struct uvIdleSegment, queue); + QUEUE_REMOVE(&segment->queue); + uvPrepareDiscard(uv, segment->fd, segment->counter); + RaftHeapFree(segment); + } +} + +#undef tracef diff --git a/src/raft/uv_recv.c b/src/raft/uv_recv.c new file mode 100644 index 000000000..72d6f5ec6 --- /dev/null +++ b/src/raft/uv_recv.c @@ -0,0 +1,423 @@ +#include + +#include "../raft.h" + +#include "assert.h" +#include "byte.h" +#include "configuration.h" +#include "err.h" +#include "heap.h" +#include "uv.h" +#include "uv_encoding.h" + +/* The happy path for a receiving an RPC message is: + * + * - When a peer server successfully establishes a new connection with us, the + * transport invokes our accept callback. + * + * - A new server object is created and added to the servers array. It starts + * reading from the stream handle of the new connection. + * + * - The RPC message preamble is read, which contains the message type and the + * message length. + * + * - The RPC message header is read, whose content depends on the message type. + * + * - Optionally, the RPC message payload is read (for AppendEntries requests). + * + * - The recv callback passed to raft_io->start() gets fired with the received + * message. + * + * Possible failure modes are: + * + * - The peer server disconnects. In this case the read callback will fire with + * UV_EOF, we'll close the stream handle and then release all memory + * associated with the server object. + * + * - The peer server sends us invalid data. In this case we close the stream + * handle and act like above. 
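+ *
+ * On the wire, every message starts with a two-word preamble, followed by a
+ * header and an optional payload (see uvServerReadCb() below):
+ *
+ *   preamble[0]: message type, 64 bits (only the low 16 bits are decoded)
+ *   preamble[1]: size of the header section in bytes, 64 bits
+ *   header:      message-specific fields
+ *   payload:     present only for AppendEntries (the entries' data) and
+ *                InstallSnapshot (the snapshot data)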
+ */ + +struct uvServer +{ + struct uv *uv; /* libuv I/O implementation object */ + raft_id id; /* ID of the remote server */ + char *address; /* Address of the other server */ + struct uv_stream_s *stream; /* Connection handle */ + uv_buf_t buf; /* Sliding buffer for reading incoming data */ + uint64_t preamble[2]; /* Static buffer with the request preamble */ + uv_buf_t header; /* Dynamic buffer with the request header */ + uv_buf_t payload; /* Dynamic buffer with the request payload */ + struct raft_message message; /* The message being received */ + queue queue; /* Servers queue */ +}; + +/* Initialize a new server object for reading requests from an incoming + * connection. */ +static int uvServerInit(struct uvServer *s, + struct uv *uv, + const raft_id id, + const char *address, + struct uv_stream_s *stream) +{ + s->uv = uv; + s->id = id; + s->address = RaftHeapMalloc(strlen(address) + 1); + if (s->address == NULL) { + return RAFT_NOMEM; + } + strcpy(s->address, address); + s->stream = stream; + s->stream->data = s; + s->buf.base = NULL; + s->buf.len = 0; + s->preamble[0] = 0; + s->preamble[1] = 0; + s->header.base = NULL; + s->header.len = 0; + s->message.type = 0; + s->payload.base = NULL; + s->payload.len = 0; + QUEUE_PUSH(&uv->servers, &s->queue); + return 0; +} + +static void uvServerDestroy(struct uvServer *s) +{ + QUEUE_REMOVE(&s->queue); + + if (s->header.base != NULL) { + /* This means we were interrupted while reading the header. */ + RaftHeapFree(s->header.base); + switch (s->message.type) { + case RAFT_IO_APPEND_ENTRIES: + RaftHeapFree(s->message.append_entries.entries); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + configurationClose( + &s->message.install_snapshot.conf); + break; + } + } + if (s->payload.base != NULL) { + /* This means we were interrupted while reading the payload. */ + RaftHeapFree(s->payload.base); + } + RaftHeapFree(s->address); + RaftHeapFree(s->stream); +} + +/* Invoked to initialize the read buffer for the next asynchronous read on the + * socket. */ +static void uvServerAllocCb(uv_handle_t *handle, + size_t suggested_size, + uv_buf_t *buf) +{ + struct uvServer *s = handle->data; + (void)suggested_size; + + assert(!s->uv->closing); + + /* If this is the first read of the preamble, or of the header, or of + * the payload, then initialize the read buffer, according to the chunk + * of data that we expect next. */ + if (s->buf.len == 0) { + assert(s->buf.base == NULL); + + /* Check if we expect the preamble. */ + if (s->header.len == 0) { + assert(s->preamble[0] == 0); + assert(s->preamble[1] == 0); + s->buf.base = (char *)s->preamble; + s->buf.len = sizeof s->preamble; + goto out; + } + + /* Check if we expect the header. */ + if (s->payload.len == 0) { + assert(s->header.len > 0); + assert(s->header.base == NULL); + s->header.base = RaftHeapMalloc(s->header.len); + if (s->header.base == NULL) { + /* Setting all buffer fields to 0 will make + * read_cb fail with ENOBUFS. */ + memset(buf, 0, sizeof *buf); + return; + } + s->buf = s->header; + goto out; + } + + /* If we get here we should be expecting the payload. */ + assert(s->payload.len > 0); + s->payload.base = RaftHeapMalloc(s->payload.len); + if (s->payload.base == NULL) { + /* Setting all buffer fields to 0 will make read_cb fail + * with ENOBUFS. */ + memset(buf, 0, sizeof *buf); + return; + } + + s->buf = s->payload; + } + +out: + *buf = s->buf; +} + +/* Callback invoked afer the stream handle of this server connection has been + * closed. 
We can release all resources associated with the server object. */ +static void uvServerStreamCloseCb(uv_handle_t *handle) +{ + struct uvServer *s = handle->data; + struct uv *uv = s->uv; + uvServerDestroy(s); + RaftHeapFree(s); + uvMaybeFireCloseCb(uv); +} + +static void uvServerAbort(struct uvServer *s) +{ + struct uv *uv = s->uv; + QUEUE_REMOVE(&s->queue); + QUEUE_PUSH(&uv->aborting, &s->queue); + uv_close((struct uv_handle_s *)s->stream, uvServerStreamCloseCb); +} + +/* Invoke the receive callback. */ +static void uvFireRecvCb(struct uvServer *s) +{ + s->uv->recv_cb(s->uv->io, &s->message); + + /* Reset our state as we'll start reading a new message. We don't need + * to release the payload buffer, since ownership was transferred to the + * user. */ + memset(s->preamble, 0, sizeof s->preamble); + raft_free(s->header.base); + s->message.type = 0; + s->header.base = NULL; + s->header.len = 0; + s->payload.base = NULL; + s->payload.len = 0; +} + +/* Callback invoked when data has been read from the socket. */ +static void uvServerReadCb(uv_stream_t *stream, + ssize_t nread, + const uv_buf_t *buf) +{ + struct uvServer *s = stream->data; + int rv; + + (void)buf; + + assert(!s->uv->closing); + + /* If the read was successful, let's check if we have received all the + * data we expected. */ + if (nread > 0) { + size_t n = (size_t)nread; + + /* We shouldn't have read more data than the pending amount. */ + assert(n <= s->buf.len); + + /* Advance the read window */ + s->buf.base += n; + s->buf.len -= n; + + /* If there's more data to read in order to fill the current + * read buffer, just return, we'll be invoked again. */ + if (s->buf.len > 0) { + return; + } + + if (s->header.len == 0) { + /* If the header buffer is not set, it means that we've + * just completed reading the preamble. */ + assert(s->header.base == NULL); + + s->header.len = (size_t)byteFlip64(s->preamble[1]); + + /* The length of the header must be greater than zero. + */ + if (s->header.len == 0) { + tracef("message has zero length"); + goto abort; + } + } else if (s->payload.len == 0) { + /* If the payload buffer is not set, it means we just + * completed reading the message header. */ + uint64_t type; + + assert(s->header.base != NULL); + + type = byteFlip64(s->preamble[0]); + + /* Only use first 2 bytes of the type. Normally we would + * check if type doesn't overflow UINT16_MAX, but we + * don't do this to allow future legacy nodes to still + * handle messages that include extra information in the + * 6 unused bytes of the type field of the preamble. + * TODO: This is preparation to add the version of the + * message in the raft preamble. Once this change has + * been active for sufficiently long time, we can start + * encoding the version in some of the remaining bytes + * of s->preamble[0]. */ + rv = uvDecodeMessage((uint16_t)type, &s->header, + &s->message, &s->payload.len); + if (rv != 0) { + tracef("decode message: %s", + errCodeToString(rv)); + goto abort; + } + + s->message.server_id = s->id; + s->message.server_address = s->address; + + /* If the message has no payload, we're done. */ + if (s->payload.len == 0) { + uvFireRecvCb(s); + } + } else { + /* If we get here it means that we've just completed + * reading the payload. 
TODO: avoid converting from + * uv_buf_t */ + struct raft_buffer payload; + assert(s->payload.base != NULL); + assert(s->payload.len > 0); + + switch (s->message.type) { + case RAFT_IO_APPEND_ENTRIES: + payload.base = s->payload.base; + payload.len = s->payload.len; + uvDecodeEntriesBatch( + payload.base, 0, + s->message.append_entries.entries, + s->message.append_entries + .n_entries); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + s->message.install_snapshot.data.base = + s->payload.base; + break; + default: + /* We should never have read a payload + * in the first place */ + assert(0); + } + + uvFireRecvCb(s); + } + + /* Mark that we're done with this chunk. When the alloc callback + * will trigger again it will notice that it needs to change the + * read buffer. */ + assert(s->buf.len == 0); + s->buf.base = NULL; + + return; + } + + /* The if nread>0 condition above should always exit the function with a + * goto abort or a return. */ + assert(nread <= 0); + + if (nread == 0) { + /* Empty read */ + return; + } + if (nread != UV_EOF) { + tracef("receive data: %s", uv_strerror((int)nread)); + } + +abort: + uvServerAbort(s); +} + +/* Start reading incoming requests. */ +static int uvServerStart(struct uvServer *s) +{ + int rv; + rv = uv_read_start(s->stream, uvServerAllocCb, uvServerReadCb); + if (rv != 0) { + tracef("start reading: %s", uv_strerror(rv)); + return RAFT_IOERR; + } + return 0; +} + +static int uvAddServer(struct uv *uv, + raft_id id, + const char *address, + struct uv_stream_s *stream) +{ + struct uvServer *server; + int rv; + + /* Initialize the new connection */ + server = RaftHeapMalloc(sizeof *server); + if (server == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + rv = uvServerInit(server, uv, id, address, stream); + if (rv != 0) { + goto err_after_server_alloc; + } + + /* This will start reading requests. */ + rv = uvServerStart(server); + if (rv != 0) { + goto err_after_init_server; + } + + return 0; + +err_after_init_server: + uvServerDestroy(server); +err_after_server_alloc: + raft_free(server); +err: + assert(rv != 0); + return rv; +} + +static void uvRecvAcceptCb(struct raft_uv_transport *transport, + raft_id id, + const char *address, + struct uv_stream_s *stream) +{ + struct uv *uv = transport->data; + int rv; + assert(!uv->closing); + rv = uvAddServer(uv, id, address, stream); + if (rv != 0) { + tracef("add server: %s", errCodeToString(rv)); + uv_close((struct uv_handle_s *)stream, + (uv_close_cb)RaftHeapFree); + } +} + +int UvRecvStart(struct uv *uv) +{ + int rv; + rv = uv->transport->listen(uv->transport, uvRecvAcceptCb); + if (rv != 0) { + return rv; + } + return 0; +} + +void UvRecvClose(struct uv *uv) +{ + while (!QUEUE_IS_EMPTY(&uv->servers)) { + queue *head; + struct uvServer *server; + head = QUEUE_HEAD(&uv->servers); + server = QUEUE_DATA(head, struct uvServer, queue); + uvServerAbort(server); + } +} + +#undef tracef diff --git a/src/raft/uv_segment.c b/src/raft/uv_segment.c new file mode 100644 index 000000000..ca178238d --- /dev/null +++ b/src/raft/uv_segment.c @@ -0,0 +1,1158 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "array.h" +#include "assert.h" +#include "byte.h" +#include "configuration.h" +#include "entry.h" +#include "heap.h" +#include "uv.h" +#include "uv_encoding.h" + +/* Check if the given filename matches the one of a closed segment (xxx-yyy), or + * of an open segment (open-xxx), and fill the given info structure if so. + * + * Return true if the filename matched, false otherwise. 
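+ *
+ * For instance (a sketch; the exact printf/scanf templates are defined in
+ * uv.h):
+ *
+ *   struct uvSegmentInfo info;
+ *   uvSegmentInfoMatch("0000000000000001-0000000000000100", &info);
+ *       -> true, closed segment covering entries 1 through 100
+ *   uvSegmentInfoMatch("open-7", &info);
+ *       -> true, open segment with counter 7
+ *   uvSegmentInfoMatch("metadata1", &info);
+ *       -> false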
*/ +static bool uvSegmentInfoMatch(const char *filename, struct uvSegmentInfo *info) +{ + int consumed; + int matched; + size_t n; + size_t filename_len = strnlen(filename, UV__FILENAME_LEN + 1); + + assert(filename_len < UV__FILENAME_LEN); + + matched = sscanf(filename, UV__CLOSED_TEMPLATE "%n", &info->first_index, + &info->end_index, &consumed); + if (matched == 2 && consumed == (int)filename_len) { + info->is_open = false; + goto match; + } + + matched = + sscanf(filename, UV__OPEN_TEMPLATE "%n", &info->counter, &consumed); + if (matched == 1 && consumed == (int)filename_len) { + info->is_open = true; + goto match; + } + + return false; + +match: + n = sizeof(info->filename) - 1; + strncpy(info->filename, filename, n); + info->filename[n] = '\0'; + return true; +} + +int uvSegmentInfoAppendIfMatch(const char *filename, + struct uvSegmentInfo *infos[], + size_t *n_infos, + bool *appended) +{ + struct uvSegmentInfo info; + bool matched; + int rv; + + /* Check if it's a closed or open filename */ + matched = uvSegmentInfoMatch(filename, &info); + + /* If this is neither a closed or an open segment, return. */ + if (!matched) { + *appended = false; + return 0; + } + + ARRAY__APPEND(struct uvSegmentInfo, info, infos, n_infos, rv); + if (rv == -1) { + return RAFT_NOMEM; + } + + *appended = true; + + return 0; +} + +/* Compare two segments to decide which one is more recent. */ +static int uvSegmentInfoCompare(const void *p1, const void *p2) +{ + struct uvSegmentInfo *s1 = (struct uvSegmentInfo *)p1; + struct uvSegmentInfo *s2 = (struct uvSegmentInfo *)p2; + + /* Closed segments are less recent than open segments. */ + if (s1->is_open && !s2->is_open) { + return 1; + } + if (!s1->is_open && s2->is_open) { + return -1; + } + + /* If the segments are open, compare the counter. */ + if (s1->is_open) { + assert(s2->is_open); + assert(s1->counter != s2->counter); + return s1->counter < s2->counter ? -1 : 1; + } + + /* If the segments are closed, compare the first index. The index ranges + * must be disjoint. */ + if (s2->first_index > s1->end_index) { + return -1; + } + + return 1; +} + +void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos) +{ + qsort(infos, n_infos, sizeof *infos, uvSegmentInfoCompare); +} + +int uvSegmentKeepTrailing(struct uv *uv, + struct uvSegmentInfo *segments, + size_t n, + raft_index last_index, + size_t trailing, + char *errmsg) +{ + raft_index retain_index; + size_t i; + int rv; + + assert(last_index > 0); + assert(n > 0); + + if (last_index <= trailing) { + return 0; + } + + /* Index of the oldest entry we want to retain. */ + retain_index = last_index - trailing + 1; + + for (i = 0; i < n; i++) { + struct uvSegmentInfo *segment = &segments[i]; + if (segment->is_open) { + break; + } + if (trailing == 0 || segment->end_index < retain_index) { + rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "delete closed segment %s", + segment->filename); + return rv; + } + } else { + break; + } + } + + return 0; +} + +/* Read a segment file and return its format version. 
*/ +static int uvReadSegmentFile(struct uv *uv, + const char *filename, + struct raft_buffer *buf, + uint64_t *format) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + rv = UvFsReadFile(uv->dir, filename, buf, errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read file"); + return RAFT_IOERR; + } + if (buf->len < 8) { + ErrMsgPrintf(uv->io->errmsg, "file has only %zu bytes", + buf->len); + RaftHeapFree(buf->base); + return RAFT_IOERR; + } + *format = byteFlip64(*(uint64_t *)buf->base); + return 0; +} + +/* Consume the content buffer, returning a pointer to the current position and + * advancing the offset of n bytes. Return an error if not enough bytes are + * available. */ +static int uvConsumeContent(const struct raft_buffer *content, + size_t *offset, + size_t n, + void **data, + char *errmsg) +{ + if (*offset + n > content->len) { + size_t remaining = content->len - *offset; + ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu", + remaining, n); + return RAFT_IOERR; + } + if (data != NULL) { + *data = &((uint8_t *)content->base)[*offset]; + } + *offset += n; + return 0; +} + +/* Load a single batch of entries from a segment. + * + * Set @last to #true if the loaded batch is the last one. */ +static int uvLoadEntriesBatch(struct uv *uv, + const struct raft_buffer *content, + struct raft_entry **entries, + unsigned *n_entries, + size_t *offset, /* Offset of last batch */ + bool *last) +{ + void *checksums; /* CRC32 checksums */ + void *batch; /* Entries batch */ + unsigned long n; /* Number of entries in the batch */ + unsigned max_n; /* Maximum number of entries we expect */ + unsigned i; /* Iterate through the entries */ + struct raft_buffer header; /* Batch header */ + struct raft_buffer data; /* Batch data */ + uint32_t crc1; /* Target checksum */ + uint32_t crc2; /* Actual checksum */ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + size_t start; + int rv; + + /* Save the current offset, to provide more information when logging. */ + start = *offset; + + /* Read the checksums. */ + rv = uvConsumeContent(content, offset, sizeof(uint32_t) * 2, &checksums, + errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble"); + return RAFT_IOERR; + } + + /* Read the first 8 bytes of the batch, which contains the number of + * entries in the batch. */ + rv = + uvConsumeContent(content, offset, sizeof(uint64_t), &batch, errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble"); + return RAFT_IOERR; + } + + n = (size_t)byteFlip64(*(uint64_t *)batch); + if (n == 0) { + ErrMsgPrintf(uv->io->errmsg, + "entries count in preamble is zero"); + rv = RAFT_CORRUPT; + goto err; + } + + /* Very optimistic upper bound of the number of entries we should + * expect. This is mainly a protection against allocating too much + * memory. Each entry will consume at least 4 words (for term, type, + * size and payload). */ + max_n = UV__MAX_SEGMENT_SIZE / (sizeof(uint64_t) * 4); + + if (n > max_n) { + ErrMsgPrintf(uv->io->errmsg, + "entries count %lu in preamble is too high", n); + rv = RAFT_CORRUPT; + goto err; + } + + /* Consume the batch header, excluding the first 8 bytes containing the + * number of entries, which we have already read. */ + header.len = uvSizeofBatchHeader(n); + header.base = batch; + + rv = uvConsumeContent(content, offset, + uvSizeofBatchHeader(n) - sizeof(uint64_t), NULL, + errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read header"); + rv = RAFT_IOERR; + goto err; + } + + /* Check batch header integrity. 
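+         * Each batch is laid out as [crc1][crc2][header][data], where the
+         * two CRC32 checksums are 32 bits each: crc1, checked here, covers
+         * the batch header, while crc2 covers the batch data and is verified
+         * further below.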
*/ + crc1 = byteFlip32(((uint32_t *)checksums)[0]); + crc2 = byteCrc32(header.base, header.len, 0); + if (crc1 != crc2) { + ErrMsgPrintf(uv->io->errmsg, "header checksum mismatch"); + rv = RAFT_CORRUPT; + goto err; + } + + /* Decode the batch header, allocating the entries array. */ + rv = uvDecodeBatchHeader(header.base, entries, n_entries); + if (rv != 0) { + goto err; + } + + /* Calculate the total size of the batch data */ + data.len = 0; + for (i = 0; i < n; i++) { + data.len += (*entries)[i].buf.len; + } + data.base = (uint8_t *)content->base + *offset; + + /* Consume the batch data */ + rv = uvConsumeContent(content, offset, data.len, NULL, errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read data"); + rv = RAFT_IOERR; + goto err_after_header_decode; + } + + /* Check batch data integrity. */ + crc1 = byteFlip32(((uint32_t *)checksums)[1]); + crc2 = byteCrc32(data.base, data.len, 0); + if (crc1 != crc2) { + ErrMsgPrintf(uv->io->errmsg, "data checksum mismatch"); + rv = RAFT_CORRUPT; + goto err_after_header_decode; + } + + uvDecodeEntriesBatch(content->base, *offset - data.len, *entries, + *n_entries); + + *last = *offset == content->len; + + return 0; + +err_after_header_decode: + RaftHeapFree(*entries); +err: + *entries = NULL; + *n_entries = 0; + assert(rv != 0); + *offset = start; + return rv; +} + +/* Append to @entries2 all entries in @entries1. */ +static int extendEntries(const struct raft_entry *entries1, + const size_t n_entries1, + struct raft_entry **entries2, + size_t *n_entries2) +{ + struct raft_entry *entries; /* To re-allocate the given entries */ + size_t i; + + entries = raft_realloc(*entries2, + (*n_entries2 + n_entries1) * sizeof *entries); + if (entries == NULL) { + return RAFT_NOMEM; + } + + for (i = 0; i < n_entries1; i++) { + entries[*n_entries2 + i] = entries1[i]; + } + + *entries2 = entries; + *n_entries2 += n_entries1; + + return 0; +} + +int uvSegmentLoadClosed(struct uv *uv, + struct uvSegmentInfo *info, + struct raft_entry *entries[], + size_t *n) +{ + bool empty; /* Whether the file is empty */ + uint64_t format; /* Format version */ + bool last; /* Whether the last batch was reached */ + struct raft_entry *tmp_entries; /* Entries in current batch */ + struct raft_buffer buf; /* Segment file content */ + size_t offset; /* Content read cursor */ + unsigned tmp_n; /* Number of entries in current batch */ + unsigned expected_n; /* Number of entries that we expect to find */ + int i; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + expected_n = (unsigned)(info->end_index - info->first_index + 1); + + /* If the segment is completely empty, just bail out. */ + rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg); + if (rv != 0) { + tracef("stat %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err; + } + if (empty) { + ErrMsgPrintf(uv->io->errmsg, "file is empty"); + rv = RAFT_CORRUPT; + goto err; + } + + /* Open the segment file. */ + rv = uvReadSegmentFile(uv, info->filename, &buf, &format); + if (rv != 0) { + goto err; + } + if (format != UV__DISK_FORMAT) { + ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju", + format); + rv = RAFT_CORRUPT; + goto err_after_read; + } + + /* Load all batches in the segment. 
*/ + *entries = NULL; + *n = 0; + + last = false; + offset = sizeof format; + for (i = 1; !last; i++) { + rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n, &offset, + &last); + if (rv != 0) { + ErrMsgWrapf(uv->io->errmsg, + "entries batch %u starting at byte %zu", i, + offset); + /* Clean up the last allocation from extendEntries. */ + goto err_after_extend_entries; + } + rv = extendEntries(tmp_entries, tmp_n, entries, n); + if (rv != 0) { + goto err_after_batch_load; + } + raft_free(tmp_entries); + } + + if (*n != expected_n) { + ErrMsgPrintf(uv->io->errmsg, "found %zu entries (expected %u)", + *n, expected_n); + rv = RAFT_CORRUPT; + goto err_after_extend_entries; + } + + assert(i > 1); /* At least one batch was loaded. */ + assert(*n > 0); /* At least one entry was loaded. */ + + return 0; + +err_after_batch_load: + raft_free(tmp_entries[0].batch); + raft_free(tmp_entries); + +err_after_extend_entries: + if (*entries != NULL) { + RaftHeapFree(*entries); + } + +err_after_read: + RaftHeapFree(buf.base); + +err: + assert(rv != 0); + + return rv; +} + +/* Check if the content of the segment file contains all zeros from the current + * offset onward. */ +static bool uvContentHasOnlyTrailingZeros(const struct raft_buffer *buf, + size_t offset) +{ + size_t i; + + for (i = offset; i < buf->len; i++) { + if (((char *)buf->base)[i] != 0) { + return false; + } + } + + return true; +} + +/* Load all entries contained in an open segment. */ +static int uvSegmentLoadOpen(struct uv *uv, + struct uvSegmentInfo *info, + struct raft_entry *entries[], + size_t *n, + raft_index *next_index) +{ + raft_index first_index; /* Index of first entry in segment */ + bool all_zeros; /* Whether the file is zero'ed */ + bool empty; /* Whether the segment file is empty */ + bool remove = false; /* Whether to remove this segment */ + bool last = false; /* Whether the last batch was reached */ + uint64_t format; /* Format version */ + size_t n_batches = 0; /* Number of loaded batches */ + struct raft_entry *tmp_entries; /* Entries in current batch */ + struct raft_buffer buf = {0}; /* Segment file content */ + size_t offset; /* Content read cursor */ + unsigned tmp_n_entries; /* Number of entries in current batch */ + int i; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + first_index = *next_index; + + rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg); + if (rv != 0) { + tracef("check if %s is empty: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err; + } + + if (empty) { + /* Empty segment, let's discard it. */ + tracef("remove empty open segment %s", info->filename); + remove = true; + goto done; + } + + rv = uvReadSegmentFile(uv, info->filename, &buf, &format); + if (rv != 0) { + goto err; + } + + /* Check that the format is the expected one, or perhaps 0, indicating + * that the segment was allocated but never written. */ + offset = sizeof format; + if (format != UV__DISK_FORMAT) { + if (format == 0) { + all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset); + if (all_zeros) { + /* This is equivalent to the empty case, let's + * remove the segment. */ + tracef("remove zeroed open segment %s", + info->filename); + remove = true; + RaftHeapFree(buf.base); + buf.base = NULL; + goto done; + } + } + ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju", + format); + rv = RAFT_CORRUPT; + goto err_after_read; + } + + /* Load all batches in the segment. 
*/ + for (i = 1; !last; i++) { + rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n_entries, + &offset, &last); + if (rv != 0) { + /* If this isn't a decoding error, just bail out. */ + if (rv != RAFT_CORRUPT) { + ErrMsgWrapf( + uv->io->errmsg, + "entries batch %u starting at byte %zu", i, + offset); + goto err_after_read; + } + + /* If this is a decoding error, and not an OS error, + * check if the rest of the file is filled with zeros. + * In that case we assume that the server shutdown + * uncleanly and we just truncate this incomplete data. + */ + all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset); + if (!all_zeros) { + tracef("%s has non-zero trail", info->filename); + } + + tracef( + "truncate open segment %s at %zu (batch %d), since " + "it has " + "corrupted " + "entries", + info->filename, offset, i); + + break; + } + + rv = extendEntries(tmp_entries, tmp_n_entries, entries, n); + if (rv != 0) { + goto err_after_batch_load; + } + + raft_free(tmp_entries); + + n_batches++; + *next_index += tmp_n_entries; + } + + if (n_batches == 0) { + RaftHeapFree(buf.base); + buf.base = NULL; + remove = true; + } + +done: + /* If the segment has no valid entries in it, we remove it. Otherwise we + * rename it and keep it. */ + if (remove) { + rv = UvFsRemoveFile(uv->dir, info->filename, errmsg); + if (rv != 0) { + tracef("unlink %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err_after_read; + } + } else { + char filename[UV__SEGMENT_FILENAME_BUF_SIZE]; + raft_index end_index = *next_index - 1; + + /* At least one entry was loaded */ + assert(end_index >= first_index); + int nb = snprintf(filename, sizeof(filename), + UV__CLOSED_TEMPLATE, first_index, end_index); + if ((nb < 0) || ((size_t)nb >= sizeof(filename))) { + tracef("snprintf failed: %d", nb); + rv = RAFT_IOERR; + goto err; + } + + tracef("finalize %s into %s", info->filename, filename); + + rv = UvFsTruncateAndRenameFile( + uv->dir, (size_t)offset, info->filename, filename, errmsg); + if (rv != 0) { + tracef("finalize %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err; + } + + info->is_open = false; + info->first_index = first_index; + info->end_index = end_index; + memset(info->filename, '\0', sizeof(info->filename)); + _Static_assert(sizeof(info->filename) >= sizeof(filename), + "Destination buffer too small"); + /* info->filename is zeroed out, info->filename is at least as + * large as filename and we checked that nb < sizeof(filename) + * -> we won't overflow and the result will be zero terminated. + */ + memcpy(info->filename, filename, (size_t)nb); + } + + return 0; + +err_after_batch_load: + raft_free(tmp_entries[0].batch); + raft_free(tmp_entries); + +err_after_read: + if (buf.base != NULL) { + RaftHeapFree(buf.base); + } + +err: + assert(rv != 0); + + return rv; +} + +/* Ensure that the write buffer of the given segment is large enough to hold the + * the given number of bytes size. */ +static int uvEnsureSegmentBufferIsLargeEnough(struct uvSegmentBuffer *b, + size_t size) +{ + unsigned n = (unsigned)(size / b->block_size); + void *base; + size_t len; + + if (b->arena.len >= size) { + assert(b->arena.base != NULL); + return 0; + } + + if (size % b->block_size != 0) { + n++; + } + + len = b->block_size * n; + base = raft_aligned_alloc(b->block_size, len); + if (base == NULL) { + return RAFT_NOMEM; + } + memset(base, 0, len); + + /* If the current arena is initialized, we need to copy its content, + * since it might have data that we want to retain in the next write. 
*/ + if (b->arena.base != NULL) { + assert(b->arena.len >= b->block_size); + memcpy(base, b->arena.base, b->arena.len); + raft_aligned_free(b->block_size, b->arena.base); + } + + b->arena.base = base; + b->arena.len = len; + + return 0; +} + +void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size) +{ + b->block_size = block_size; + b->arena.base = NULL; + b->arena.len = 0; + b->n = 0; +} + +void uvSegmentBufferClose(struct uvSegmentBuffer *b) +{ + if (b->arena.base != NULL) { + raft_aligned_free(b->block_size, b->arena.base); + } +} + +int uvSegmentBufferFormat(struct uvSegmentBuffer *b) +{ + int rv; + void *cursor; + size_t n; + assert(b->n == 0); + n = sizeof(uint64_t); + rv = uvEnsureSegmentBufferIsLargeEnough(b, n); + if (rv != 0) { + return rv; + } + b->n = n; + cursor = b->arena.base; + bytePut64(&cursor, UV__DISK_FORMAT); + return 0; +} + +int uvSegmentBufferAppend(struct uvSegmentBuffer *b, + const struct raft_entry entries[], + unsigned n_entries) +{ + size_t size; /* Total size of the batch */ + uint32_t crc1; /* Header checksum */ + uint32_t crc2; /* Data checksum */ + void *crc1_p; /* Pointer to header checksum slot */ + void *crc2_p; /* Pointer to data checksum slot */ + void *header; /* Pointer to the header section */ + void *cursor; + unsigned i; + int rv; + + size = sizeof(uint32_t) * 2; /* CRC checksums */ + size += uvSizeofBatchHeader(n_entries); /* Batch header */ + for (i = 0; i < n_entries; i++) { /* Entries data */ + size += bytePad64(entries[i].buf.len); + } + + rv = uvEnsureSegmentBufferIsLargeEnough(b, b->n + size); + if (rv != 0) { + return rv; + } + cursor = b->arena.base + b->n; + + /* Placeholder of the checksums */ + crc1_p = cursor; + bytePut32(&cursor, 0); + crc2_p = cursor; + bytePut32(&cursor, 0); + + /* Batch header */ + header = cursor; + uvEncodeBatchHeader(entries, n_entries, cursor); + crc1 = byteCrc32(header, uvSizeofBatchHeader(n_entries), 0); + cursor = (uint8_t *)cursor + uvSizeofBatchHeader(n_entries); + + /* Batch data */ + crc2 = 0; + for (i = 0; i < n_entries; i++) { + const struct raft_entry *entry = &entries[i]; + assert(entry->buf.len % sizeof(uint64_t) == 0); + memcpy(cursor, entry->buf.base, entry->buf.len); + crc2 = byteCrc32(cursor, entry->buf.len, crc2); + cursor = (uint8_t *)cursor + entry->buf.len; + } + + bytePut32(&crc1_p, crc1); + bytePut32(&crc2_p, crc2); + b->n += size; + + return 0; +} + +void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out) +{ + unsigned n_blocks; + unsigned tail; + + n_blocks = (unsigned)(b->n / b->block_size); + if (b->n % b->block_size != 0) { + n_blocks++; + } + + /* Set the remainder of the last block to 0 */ + tail = (unsigned)(b->n % b->block_size); + if (tail != 0) { + memset(b->arena.base + b->n, 0, b->block_size - tail); + } + + out->base = b->arena.base; + out->len = n_blocks * b->block_size; +} + +void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain) +{ + assert(b->n > 0); + assert(b->arena.base != NULL); + + if (retain == 0) { + b->n = 0; + memset(b->arena.base, 0, b->block_size); + return; + } + + memcpy(b->arena.base, b->arena.base + retain * b->block_size, + b->block_size); + b->n = b->n % b->block_size; +} + +/* When a corrupted segment is detected, the segment is renamed. + * Upon a restart, raft will not detect the segment anymore and will try + * to start without it. 
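+ *
+ * For example, a corrupt open segment "open-3" would be renamed to something
+ * like "corrupt-1715000000000000000-open-3" (illustrative timestamp),
+ * following CORRUPT_FILE_FMT below: the wall-clock time in nanoseconds, then
+ * the original filename.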
+ * */ +#define CORRUPT_FILE_FMT "corrupt-%" PRId64 "-%s" +static void uvMoveCorruptSegment(struct uv *uv, struct uvSegmentInfo *info) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + char new_filename[UV__FILENAME_LEN + 1] = {0}; + size_t sz = sizeof(new_filename); + int rv; + + struct timespec ts = {0}; + /* Ignore errors */ + clock_gettime(CLOCK_REALTIME, &ts); + int64_t ns = ts.tv_sec * 1000000000 + ts.tv_nsec; + rv = snprintf(new_filename, sz, CORRUPT_FILE_FMT, ns, info->filename); + if (rv < 0 || rv >= (int)sz) { + tracef("snprintf %d", rv); + return; + } + + UvFsRenameFile(uv->dir, info->filename, new_filename, errmsg); + if (rv != 0) { + tracef("%s", errmsg); + return; + } +} + +/* + * On startup, raft will try to recover when a corrupt segment is detected. + * + * When a corrupt open segment is encountered, it, and all subsequent open + * segments, are renamed. Not renaming newer, possible non-corrupt, open + * segments could lead to loading inconsistent data. + * + * When a corrupt closed segment is encountered, it will be renamed when + * it is the last closed segment, in that case all open-segments are renamed + * too. + */ +static void uvRecoverFromCorruptSegment(struct uv *uv, + size_t i_corrupt, + struct uvSegmentInfo *infos, + size_t n_infos) +{ + struct uvSegmentInfo *info = &infos[i_corrupt]; + if (info->is_open) { + for (size_t i = i_corrupt; i < n_infos; ++i) { + info = &infos[i]; + uvMoveCorruptSegment(uv, info); + } + } else { + size_t i_next = i_corrupt + 1; + /* last segment or last closed segment. */ + if (i_next == n_infos || infos[i_next].is_open) { + for (size_t i = i_corrupt; i < n_infos; ++i) { + info = &infos[i]; + uvMoveCorruptSegment(uv, info); + } + } + } +} + +int uvSegmentLoadAll(struct uv *uv, + const raft_index start_index, + struct uvSegmentInfo *infos, + size_t n_infos, + struct raft_entry **entries, + size_t *n_entries) +{ + raft_index next_index; /* Next entry to load from disk */ + struct raft_entry *tmp_entries; /* Entries in current segment */ + size_t tmp_n; /* Number of entries in current segment */ + size_t i; + int rv; + + assert(start_index >= 1); + assert(n_infos > 0); + + *entries = NULL; + *n_entries = 0; + + next_index = start_index; + + for (i = 0; i < n_infos; i++) { + struct uvSegmentInfo *info = &infos[i]; + + tracef("load segment %s", info->filename); + + if (info->is_open) { + rv = uvSegmentLoadOpen(uv, info, entries, n_entries, + &next_index); + ErrMsgWrapf(uv->io->errmsg, "load open segment %s", + info->filename); + if (rv != 0) { + if (rv == RAFT_CORRUPT && uv->auto_recovery) { + uvRecoverFromCorruptSegment( + uv, i, infos, n_infos); + } + goto err; + } + } else { + assert(info->first_index >= start_index); + assert(info->first_index <= info->end_index); + + /* Check that the start index encoded in the name of the + * segment matches what we expect and there are no gaps + * in the sequence. 
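+                         * For example, after loading a closed segment
+                         * covering entries (1..100), next_index is 101: the
+                         * next closed segment must start exactly at index
+                         * 101, otherwise some entries would be missing from
+                         * the log and we must treat the store as corrupt.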
*/ + if (info->first_index != next_index) { + ErrMsgPrintf(uv->io->errmsg, + "unexpected closed segment %s: " + "first index should " + "have been %llu", + info->filename, next_index); + rv = RAFT_CORRUPT; + goto err; + } + + rv = + uvSegmentLoadClosed(uv, info, &tmp_entries, &tmp_n); + if (rv != 0) { + ErrMsgWrapf(uv->io->errmsg, + "load closed segment %s", + info->filename); + if (rv == RAFT_CORRUPT && uv->auto_recovery) { + uvRecoverFromCorruptSegment( + uv, i, infos, n_infos); + } + goto err; + } + + assert(tmp_n > 0); + rv = extendEntries(tmp_entries, tmp_n, entries, + n_entries); + if (rv != 0) { + /* TODO: release memory of entries in + * tmp_entries */ + goto err; + } + + raft_free(tmp_entries); + next_index += tmp_n; + } + } + + return 0; + +err: + assert(rv != 0); + + /* Free any batch that we might have allocated and the entries array as + * well. */ + if (*entries != NULL) { + void *batch = NULL; + + for (i = 0; i < *n_entries; i++) { + struct raft_entry *entry = &(*entries)[i]; + + if (entry->batch != batch) { + batch = entry->batch; + raft_free(batch); + } + } + + raft_free(*entries); + *entries = NULL; + *n_entries = 0; + } + + return rv; +} + +/* Write a closed segment */ +static int uvWriteClosedSegment(struct uv *uv, + raft_index first_index, + raft_index last_index, + const struct raft_buffer *conf) +{ + char filename[UV__FILENAME_LEN]; + struct uvSegmentBuffer buf = {0}; + struct raft_buffer data; + struct raft_entry entry = {0}; + size_t cap; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + assert(first_index <= last_index); + + /* Render the path */ + sprintf(filename, UV__CLOSED_TEMPLATE, first_index, last_index); + + /* Make sure that the given encoded configuration fits in the first + * block */ + cap = uv->block_size - + (sizeof(uint64_t) /* Format version */ + + sizeof(uint64_t) /* Checksums */ + uvSizeofBatchHeader(1)); + if (conf->len > cap) { + return RAFT_TOOBIG; + } + + uvSegmentBufferInit(&buf, uv->block_size); + + rv = uvSegmentBufferFormat(&buf); + if (rv != 0) { + return rv; + } + + entry.term = 1; + entry.type = RAFT_CHANGE; + entry.buf = *conf; + + rv = uvSegmentBufferAppend(&buf, &entry, 1); + if (rv != 0) { + uvSegmentBufferClose(&buf); + return rv; + } + + data.base = buf.arena.base; + data.len = buf.n; + rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg); + uvSegmentBufferClose(&buf); + if (rv != 0) { + tracef("write segment %s: %s", filename, errmsg); + return RAFT_IOERR; + } + + return 0; +} + +int uvSegmentCreateFirstClosed(struct uv *uv, + const struct raft_configuration *configuration) +{ + return uvSegmentCreateClosedWithConfiguration(uv, 1, configuration); +} + +int uvSegmentCreateClosedWithConfiguration( + struct uv *uv, + raft_index index, + const struct raft_configuration *configuration) +{ + struct raft_buffer buf; + char filename[UV__FILENAME_LEN]; + int rv; + + /* Render the path */ + sprintf(filename, UV__CLOSED_TEMPLATE, index, index); + + /* Encode the given configuration. 
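+	 * Note (editor's addition): configurationEncode() allocates buf.base,
+	 * which is released below on both the success and the error path.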
+	 */
+	rv = configurationEncode(configuration, &buf);
+	if (rv != 0) {
+		goto err;
+	}
+
+	/* Write the file */
+	rv = uvWriteClosedSegment(uv, index, index, &buf);
+	if (rv != 0) {
+		goto err_after_configuration_encode;
+	}
+
+	raft_free(buf.base);
+
+	rv = UvFsSyncDir(uv->dir, uv->io->errmsg);
+	if (rv != 0) {
+		return RAFT_IOERR;
+	}
+
+	return 0;
+
+err_after_configuration_encode:
+	raft_free(buf.base);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+int uvSegmentTruncate(struct uv *uv,
+		      struct uvSegmentInfo *segment,
+		      raft_index index)
+{
+	char filename[UV__FILENAME_LEN];
+	struct raft_entry *entries;
+	struct uvSegmentBuffer buf;
+	struct raft_buffer data;
+	size_t n;
+	unsigned m;
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int rv;
+
+	assert(!segment->is_open);
+
+	tracef("truncate %llu-%llu at %llu", segment->first_index,
+	       segment->end_index, index);
+
+	rv = uvSegmentLoadClosed(uv, segment, &entries, &n);
+	if (rv != 0) {
+		ErrMsgWrapf(uv->io->errmsg, "load closed segment %s",
+			    segment->filename);
+		goto out;
+	}
+
+	/* Discard all entries from the truncate index onwards (inclusive) */
+	assert(index - segment->first_index < n);
+	m = (unsigned)(index - segment->first_index);
+
+	uvSegmentBufferInit(&buf, uv->block_size);
+
+	rv = uvSegmentBufferFormat(&buf);
+	if (rv != 0) {
+		goto out_after_buffer_init;
+	}
+
+	rv = uvSegmentBufferAppend(&buf, entries, m);
+	if (rv != 0) {
+		goto out_after_buffer_init;
+	}
+
+	/* Render the path.
+	 *
+	 * TODO: we should use a temporary file name so in case of crash we
+	 * don't consider this segment as corrupted.
+	 */
+	sprintf(filename, UV__CLOSED_TEMPLATE, segment->first_index, index - 1);
+
+	data.base = buf.arena.base;
+	data.len = buf.n;
+
+	rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg);
+	if (rv != 0) {
+		tracef("write %s: %s", filename, errmsg);
+		rv = RAFT_IOERR;
+		goto out_after_buffer_init;
+	}
+
+out_after_buffer_init:
+	uvSegmentBufferClose(&buf);
+	entryBatchesDestroy(entries, n);
+out:
+	return rv;
+}
+
+#undef tracef
diff --git a/src/raft/uv_send.c b/src/raft/uv_send.c
new file mode 100644
index 000000000..86133542c
--- /dev/null
+++ b/src/raft/uv_send.c
@@ -0,0 +1,519 @@
+#include <string.h>
+
+#include "../raft.h"
+#include "assert.h"
+#include "heap.h"
+#include "uv.h"
+#include "uv_encoding.h"
+
+/* The happy path for a raft_io_send request is:
+ *
+ * - Get the uvClient object whose address matches the one of the target
+ *   server.
+ * - Encode the message and write it using the uvClient's TCP handle.
+ * - Once the write completes, fire the send request callback.
+ *
+ * Possible failure modes are:
+ *
+ * - The uv->clients queue has no client object with a matching address. In
+ *   this case add a new client object to the queue, add the send request to
+ *   the queue of pending requests and submit a connection request. Once the
+ *   connection request succeeds, try to write the encoded request to the
+ *   connected stream handle. If the connection request fails, schedule
+ *   another attempt.
+ *
+ * - The uv->clients queue has a client object which is not connected. Add the
+ *   send request to the pending queue, and, if there's no connection attempt
+ *   already in progress, start a new one.
+ *
+ * - The write request fails (either synchronously or asynchronously). In this
+ *   case we fire the request callback with an error, close the connection
+ *   stream, and start a re-connection attempt.
+ */
+
+/* Maximum number of requests that can be buffered.
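+ *
+ * When a connection attempt fails and the pending queue has grown beyond this
+ * limit, the oldest requests are evicted and their callbacks fired with
+ * RAFT_NOCONNECTION (see uvClientConnectCb below).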
*/ +#define UV__CLIENT_MAX_PENDING 3 + +struct uvClient +{ + struct uv *uv; /* libuv I/O implementation object */ + struct uv_timer_s timer; /* Schedule connection attempts */ + struct raft_uv_connect connect; /* Connection request */ + struct uv_stream_s *stream; /* Current connection handle */ + struct uv_stream_s *old_stream; /* Connection handle being closed */ + unsigned n_connect_attempt; /* Consecutive connection attempts */ + raft_id id; /* ID of the other server */ + char *address; /* Address of the other server */ + queue pending; /* Pending send message requests */ + queue queue; /* Clients queue */ + bool closing; /* True after calling uvClientAbort */ +}; + +/* Hold state for a single send RPC message request. */ +struct uvSend +{ + struct uvClient *client; /* Client connected to the target server */ + struct raft_io_send *req; /* User request */ + uv_buf_t *bufs; /* Encoded raft RPC message to send */ + unsigned n_bufs; /* Number of buffers */ + uv_write_t write; /* Stream write request */ + queue queue; /* Pending send requests queue */ +}; + +/* Free all memory used by the given send request object, including the object + * itself. */ +static void uvSendDestroy(struct uvSend *s) +{ + if (s->bufs != NULL) { + /* Just release the first buffer. Further buffers are entry or + * snapshot payloads, which we were passed but we don't own. */ + RaftHeapFree(s->bufs[0].base); + + /* Release the buffers array. */ + RaftHeapFree(s->bufs); + } + RaftHeapFree(s); +} + +/* Initialize a new client associated with the given server. */ +static int uvClientInit(struct uvClient *c, + struct uv *uv, + raft_id id, + const char *address) +{ + int rv; + c->uv = uv; + c->timer.data = c; + c->connect.data = NULL; /* Set upon starting a connect request */ + c->stream = NULL; /* Set upon successful connection */ + c->old_stream = NULL; /* Set after closing the current connection */ + c->n_connect_attempt = 0; + c->id = id; + c->address = RaftHeapMalloc(strlen(address) + 1); + if (c->address == NULL) { + return RAFT_NOMEM; + } + rv = uv_timer_init(c->uv->loop, &c->timer); + assert(rv == 0); + strcpy(c->address, address); + QUEUE_INIT(&c->pending); + c->closing = false; + QUEUE_PUSH(&uv->clients, &c->queue); + return 0; +} + +/* If there's no more pending cleanup, remove the client from the abort queue + * and destroy it. */ +static void uvClientMaybeDestroy(struct uvClient *c) +{ + struct uv *uv = c->uv; + + assert(c->stream == NULL); + + if (c->connect.data != NULL) { + return; + } + if (c->timer.data != NULL) { + return; + } + if (c->old_stream != NULL) { + return; + } + + while (!QUEUE_IS_EMPTY(&c->pending)) { + queue *head; + struct uvSend *send; + struct raft_io_send *req; + head = QUEUE_HEAD(&c->pending); + send = QUEUE_DATA(head, struct uvSend, queue); + QUEUE_REMOVE(head); + req = send->req; + uvSendDestroy(send); + if (req->cb != NULL) { + req->cb(req, RAFT_CANCELED); + } + } + + QUEUE_REMOVE(&c->queue); + + assert(c->address != NULL); + RaftHeapFree(c->address); + RaftHeapFree(c); + + uvMaybeFireCloseCb(uv); +} + +/* Forward declaration. */ +static void uvClientConnect(struct uvClient *c); + +static void uvClientDisconnectCloseCb(struct uv_handle_s *handle) +{ + struct uvClient *c = handle->data; + assert(c->old_stream != NULL); + assert(c->stream == NULL); + assert(handle == (struct uv_handle_s *)c->old_stream); + RaftHeapFree(c->old_stream); + c->old_stream = NULL; + if (c->closing) { + uvClientMaybeDestroy(c); + } else { + uvClientConnect(c); /* Trigger a new connection attempt. 
*/ + } +} + +/* Close the current connection. */ +static void uvClientDisconnect(struct uvClient *c) +{ + assert(c->stream != NULL); + assert(c->old_stream == NULL); + c->old_stream = c->stream; + c->stream = NULL; + uv_close((struct uv_handle_s *)c->old_stream, + uvClientDisconnectCloseCb); +} + +/* Invoked once an encoded RPC message has been written out. */ +static void uvSendWriteCb(struct uv_write_s *write, const int status) +{ + struct uvSend *send = write->data; + struct uvClient *c = send->client; + struct raft_io_send *req = send->req; + int cb_status = 0; + + /* If the write failed and we're not currently closing, let's consider + * the current stream handle as busted and start disconnecting (unless + * we're already doing so). We'll trigger a new connection attempt once + * the handle is closed. */ + if (status != 0) { + cb_status = RAFT_IOERR; + if (!c->closing) { + if (c->stream != NULL) { + uvClientDisconnect(c); + } + } else if (status == UV_ECANCELED) { + cb_status = RAFT_CANCELED; + } + } + + uvSendDestroy(send); + + if (req->cb != NULL) { + req->cb(req, cb_status); + } +} + +static int uvClientSend(struct uvClient *c, struct uvSend *send) +{ + int rv; + assert(!c->closing); + send->client = c; + + /* If there's no connection available, let's queue the request. */ + if (c->stream == NULL) { + tracef("no connection available -> enqueue message"); + QUEUE_PUSH(&c->pending, &send->queue); + return 0; + } + + tracef("connection available -> write message"); + send->write.data = send; + rv = uv_write(&send->write, c->stream, send->bufs, send->n_bufs, + uvSendWriteCb); + if (rv != 0) { + tracef("write message failed -> rv %d", rv); + /* UNTESTED: what are the error conditions? perhaps ENOMEM */ + return RAFT_IOERR; + } + + return 0; +} + +/* Try to execute all send requests that were blocked in the queue waiting for a + * connection. */ +static void uvClientSendPending(struct uvClient *c) +{ + int rv; + assert(c->stream != NULL); + tracef("send pending messages"); + while (!QUEUE_IS_EMPTY(&c->pending)) { + queue *head; + struct uvSend *send; + head = QUEUE_HEAD(&c->pending); + send = QUEUE_DATA(head, struct uvSend, queue); + QUEUE_REMOVE(head); + rv = uvClientSend(c, send); + if (rv != 0) { + if (send->req->cb != NULL) { + send->req->cb(send->req, rv); + } + uvSendDestroy(send); + } + } +} + +static void uvClientTimerCb(uv_timer_t *timer) +{ + struct uvClient *c = timer->data; + tracef("timer expired -> attempt to reconnect"); + uvClientConnect(c); /* Retry to connect. */ +} + +/* Return the number of send requests that we have been parked in the send queue + * because no connection is available yet. */ +static unsigned uvClientPendingCount(struct uvClient *c) +{ + queue *head; + unsigned n = 0; + QUEUE_FOREACH(head, &c->pending) + { + n++; + } + return n; +} + +static void uvClientConnectCb(struct raft_uv_connect *req, + struct uv_stream_s *stream, + int status) +{ + struct uvClient *c = req->data; + unsigned n_pending; + int rv; + + tracef("connect attempt completed -> status %s", + errCodeToString(status)); + + assert(c->connect.data != NULL); + assert(c->stream == NULL); + assert(c->old_stream == NULL); + assert(!uv_is_active((struct uv_handle_s *)&c->timer)); + + c->connect.data = NULL; + + /* If we are closing, bail out, possibly discarding the new connection. 
+ */ + if (c->closing) { + if (status == 0) { + assert(stream != NULL); + c->stream = stream; + c->stream->data = c; + uvClientDisconnect(c); + } else { + uvClientMaybeDestroy(c); + } + return; + } + + /* If, the connection attempt was successful, we're good. If we have + * pending requests, let's try to execute them. */ + if (status == 0) { + assert(stream != NULL); + c->stream = stream; + c->n_connect_attempt = 0; + c->stream->data = c; + uvClientSendPending(c); + return; + } + + /* Shrink the queue of pending requests, by failing the oldest ones */ + n_pending = uvClientPendingCount(c); + if (n_pending > UV__CLIENT_MAX_PENDING) { + unsigned i; + for (i = 0; i < n_pending - UV__CLIENT_MAX_PENDING; i++) { + tracef("queue full -> evict oldest message"); + queue *head; + struct uvSend *old_send; + struct raft_io_send *old_req; + head = QUEUE_HEAD(&c->pending); + old_send = QUEUE_DATA(head, struct uvSend, queue); + QUEUE_REMOVE(head); + old_req = old_send->req; + uvSendDestroy(old_send); + if (old_req->cb != NULL) { + old_req->cb(old_req, RAFT_NOCONNECTION); + } + } + } + + /* Let's schedule another attempt. */ + rv = uv_timer_start(&c->timer, uvClientTimerCb, + c->uv->connect_retry_delay, 0); + assert(rv == 0); +} + +/* Perform a single connection attempt, scheduling a retry if it fails. */ +static void uvClientConnect(struct uvClient *c) +{ + int rv; + + assert(!c->closing); + assert(c->stream == NULL); + assert(c->old_stream == NULL); + assert(!uv_is_active((struct uv_handle_s *)&c->timer)); + assert(c->connect.data == NULL); + + c->n_connect_attempt++; + + c->connect.data = c; + rv = c->uv->transport->connect(c->uv->transport, &c->connect, c->id, + c->address, uvClientConnectCb); + if (rv != 0) { + /* Restart the timer, so we can retry. */ + c->connect.data = NULL; + rv = uv_timer_start(&c->timer, uvClientTimerCb, + c->uv->connect_retry_delay, 0); + assert(rv == 0); + } +} + +/* Final callback in the close chain of an io_uv__client object */ +static void uvClientTimerCloseCb(struct uv_handle_s *handle) +{ + struct uvClient *c = handle->data; + assert(handle == (struct uv_handle_s *)&c->timer); + c->timer.data = NULL; + uvClientMaybeDestroy(c); +} + +/* Start shutting down a client. This happens when the `raft_io` instance + * has been closed or when the address of the client has changed. */ +static void uvClientAbort(struct uvClient *c) +{ + struct uv *uv = c->uv; + int rv; + + assert(c->stream != NULL || c->old_stream != NULL || + uv_is_active((struct uv_handle_s *)&c->timer) || + c->connect.data != NULL); + + QUEUE_REMOVE(&c->queue); + QUEUE_PUSH(&uv->aborting, &c->queue); + + rv = uv_timer_stop(&c->timer); + assert(rv == 0); + + /* If we are connected, let's close the outbound stream handle. This + * will eventually complete all inflight write requests, possibly with + * failing them with UV_ECANCELED. */ + if (c->stream != NULL) { + uvClientDisconnect(c); + } + + /* Closing the timer implicitly stop it, so the timeout callback won't + * be fired. */ + uv_close((struct uv_handle_s *)&c->timer, uvClientTimerCloseCb); + c->closing = true; +} + +/* Find the client object associated with the given server, or create one if + * there's none yet. */ +static int uvGetClient(struct uv *uv, + const raft_id id, + const char *address, + struct uvClient **client) +{ + queue *head; + int rv; + + /* Check if we already have a client object for this peer server. 
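+	 * The clients queue is scanned linearly; a raft cluster typically has
+	 * only a handful of peers, so this lookup is cheap in practice.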
*/ + QUEUE_FOREACH(head, &uv->clients) + { + *client = QUEUE_DATA(head, struct uvClient, queue); + if ((*client)->id != id) { + continue; + } + + /* Client address has changed, abort connection and create a new + * one. */ + if (strcmp((*client)->address, address) != 0) { + uvClientAbort(*client); + break; + } + + return 0; + } + + /* Initialize the new connection */ + *client = RaftHeapMalloc(sizeof **client); + if (*client == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + rv = uvClientInit(*client, uv, id, address); + if (rv != 0) { + goto err_after_client_alloc; + } + + /* Make a first connection attempt right away.. */ + uvClientConnect(*client); + + return 0; + +err_after_client_alloc: + RaftHeapFree(*client); +err: + assert(rv != 0); + return rv; +} + +int UvSend(struct raft_io *io, + struct raft_io_send *req, + const struct raft_message *message, + raft_io_send_cb cb) +{ + struct uv *uv = io->impl; + struct uvSend *send; + struct uvClient *client; + int rv; + + assert(!uv->closing); + + /* Allocate a new request object. */ + send = RaftHeapMalloc(sizeof *send); + if (send == NULL) { + rv = RAFT_NOMEM; + goto err; + } + send->req = req; + req->cb = cb; + + rv = uvEncodeMessage(message, &send->bufs, &send->n_bufs); + if (rv != 0) { + send->bufs = NULL; + goto err_after_send_alloc; + } + + /* Get a client object connected to the target server, creating it if it + * doesn't exist yet. */ + rv = uvGetClient(uv, message->server_id, message->server_address, + &client); + if (rv != 0) { + goto err_after_send_alloc; + } + + rv = uvClientSend(client, send); + if (rv != 0) { + goto err_after_send_alloc; + } + + return 0; + +err_after_send_alloc: + uvSendDestroy(send); +err: + assert(rv != 0); + return rv; +} + +void UvSendClose(struct uv *uv) +{ + assert(uv->closing); + while (!QUEUE_IS_EMPTY(&uv->clients)) { + queue *head; + struct uvClient *client; + head = QUEUE_HEAD(&uv->clients); + client = QUEUE_DATA(head, struct uvClient, queue); + uvClientAbort(client); + } +} + +#undef tracef diff --git a/src/raft/uv_snapshot.c b/src/raft/uv_snapshot.c new file mode 100644 index 000000000..d4b8910d1 --- /dev/null +++ b/src/raft/uv_snapshot.c @@ -0,0 +1,808 @@ +#include +#include + +#include "array.h" +#include "assert.h" +#include "byte.h" +#include "compress.h" +#include "configuration.h" +#include "heap.h" +#include "uv.h" +#include "uv_encoding.h" +#include "uv_os.h" + +/* Arbitrary maximum configuration size. Should be practically be enough */ +#define UV__META_MAX_CONFIGURATION_SIZE 1024 * 1024 + +/* Returns true if the filename is a valid snapshot file or snapshot meta + * filename depending on the `meta` switch. If the parse is successful, the + * arguments will contain the parsed values. */ +static bool uvSnapshotParseFilename(const char *filename, + bool meta, + raft_term *term, + raft_index *index, + raft_time *timestamp) +{ + /* Check if it's a well-formed snapshot filename */ + int consumed = 0; + int matched; + size_t filename_len = strlen(filename); + assert(filename_len < UV__FILENAME_LEN); + if (meta) { + matched = sscanf(filename, UV__SNAPSHOT_META_TEMPLATE "%n", + term, index, timestamp, &consumed); + } else { + matched = sscanf(filename, UV__SNAPSHOT_TEMPLATE "%n", term, + index, timestamp, &consumed); + } + if (matched != 3 || consumed != (int)filename_len) { + return false; + } + + return true; +} + +/* Check if the given filename matches the pattern of a snapshot metadata + * filename (snapshot-xxx-yyy-zzz.meta), and fill the given info structure if + * so. 
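+ * For example (an editor's illustration):
+ * "snapshot-3-8192-1691063549.meta" would parse to term 3, index 8192 and
+ * timestamp 1691063549.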
+ * + * Return true if the filename matched, false otherwise. */ +static bool uvSnapshotInfoMatch(const char *filename, + struct uvSnapshotInfo *info) +{ + if (!uvSnapshotParseFilename(filename, true, &info->term, &info->index, + &info->timestamp)) { + return false; + } + /* Allow room for '\0' terminator */ + size_t n = sizeof(info->filename) - 1; + strncpy(info->filename, filename, n); + info->filename[n] = '\0'; + return true; +} + +void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename) +{ + size_t len = strlen(info->filename) - strlen(".meta"); + assert(len < UV__FILENAME_LEN); + strcpy(filename, info->filename); + filename[len] = 0; +} + +int UvSnapshotInfoAppendIfMatch(struct uv *uv, + const char *filename, + struct uvSnapshotInfo *infos[], + size_t *n_infos, + bool *appended) +{ + struct uvSnapshotInfo info; + bool matched; + char snapshot_filename[UV__FILENAME_LEN]; + bool exists; + bool is_empty; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + /* Check if it's a snapshot metadata filename */ + matched = uvSnapshotInfoMatch(filename, &info); + if (!matched) { + *appended = false; + return 0; + } + + /* Check if there's actually a valid snapshot file for this snapshot + * metadata. If there's none or it's empty, it means that we aborted + * before finishing the snapshot, or that another thread is still busy + * writing the snapshot. */ + uvSnapshotFilenameOf(&info, snapshot_filename); + rv = UvFsFileExists(uv->dir, snapshot_filename, &exists, errmsg); + if (rv != 0) { + tracef("stat %s: %s", snapshot_filename, errmsg); + rv = RAFT_IOERR; + return rv; + } + if (!exists) { + *appended = false; + return 0; + } + + /* TODO This check is strictly not needed, snapshot files are created by + * renaming fully written and synced tmp-files. Leaving it here, just to + * be extra-safe. Can probably be removed once more data integrity + * checks are performed at startup. */ + rv = UvFsFileIsEmpty(uv->dir, snapshot_filename, &is_empty, errmsg); + if (rv != 0) { + tracef("is_empty %s: %s", snapshot_filename, errmsg); + rv = RAFT_IOERR; + return rv; + } + if (is_empty) { + *appended = false; + return 0; + } + + ARRAY__APPEND(struct uvSnapshotInfo, info, infos, n_infos, rv); + if (rv == -1) { + return RAFT_NOMEM; + } + *appended = true; + + return 0; +} + +static int uvSnapshotIsOrphanInternal(const char *dir, + const char *filename, + bool meta, + bool *orphan) +{ + int rv; + *orphan = false; + + raft_term term; + raft_index index; + raft_time timestamp; + if (!uvSnapshotParseFilename(filename, meta, &term, &index, + ×tamp)) { + return 0; + } + + /* filename is a well-formed snapshot filename, check if the sibling + * file exists. 
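+	 * (For a metadata file the sibling is the bare snapshot file, and for
+	 * a snapshot file it is the ".meta" file; e.g. "snapshot-1-2-3.meta"
+	 * and "snapshot-1-2-3", an editor's illustration.)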
+	 */
+	char sibling_filename[UV__FILENAME_LEN];
+	if (meta) {
+		rv = snprintf(sibling_filename, UV__FILENAME_LEN,
+			      UV__SNAPSHOT_TEMPLATE, term, index, timestamp);
+	} else {
+		rv = snprintf(sibling_filename, UV__FILENAME_LEN,
+			      UV__SNAPSHOT_META_TEMPLATE, term, index,
+			      timestamp);
+	}
+
+	if (rv >= UV__FILENAME_LEN) {
+		/* Output truncated */
+		return -1;
+	}
+
+	bool sibling_exists = false;
+	char ignored[RAFT_ERRMSG_BUF_SIZE];
+	rv = UvFsFileExists(dir, sibling_filename, &sibling_exists, ignored);
+	if (rv != 0) {
+		return rv;
+	}
+
+	*orphan = !sibling_exists;
+	return 0;
+}
+
+int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan)
+{
+	return uvSnapshotIsOrphanInternal(dir, filename, false, orphan);
+}
+
+int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan)
+{
+	return uvSnapshotIsOrphanInternal(dir, filename, true, orphan);
+}
+
+/* Compare two snapshots to decide which one is more recent. */
+static int uvSnapshotCompare(const void *p1, const void *p2)
+{
+	struct uvSnapshotInfo *s1 = (struct uvSnapshotInfo *)p1;
+	struct uvSnapshotInfo *s2 = (struct uvSnapshotInfo *)p2;
+
+	/* If the terms are different, the snapshot with the highest term is
+	 * the most recent. */
+	if (s1->term != s2->term) {
+		return s1->term < s2->term ? -1 : 1;
+	}
+
+	/* If the terms are identical and the indexes differ, the snapshot
+	 * with the highest index is the most recent. */
+	if (s1->index != s2->index) {
+		return s1->index < s2->index ? -1 : 1;
+	}
+
+	/* If term and index are identical, compare the timestamps. */
+	return s1->timestamp < s2->timestamp ? -1 : 1;
+}
+
+/* Sort the given snapshots, from the least to the most recent. */
+void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos)
+{
+	qsort(infos, n_infos, sizeof *infos, uvSnapshotCompare);
+}
+
+/* Parse the metadata file of a snapshot and populate the metadata portion of
+ * the given snapshot object accordingly.
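+ *
+ * The on-disk layout read below is (an editor's sketch, inferred from the
+ * code):
+ *
+ *   word 0: format version
+ *   word 1: CRC-32 of words 2 and 3 plus the configuration data
+ *   word 2: index the configuration applies to
+ *   word 3: length in bytes of the encoded configuration that follows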
*/ +static int uvSnapshotLoadMeta(struct uv *uv, + struct uvSnapshotInfo *info, + struct raft_snapshot *snapshot, + char *errmsg) +{ + uint64_t header[1 + /* Format version */ + 1 + /* CRC checksum */ + 1 + /* Configuration index */ + 1 /* Configuration length */]; + struct raft_buffer buf; + uint64_t format; + uint32_t crc1; + uint32_t crc2; + uv_file fd; + int rv; + + snapshot->term = info->term; + snapshot->index = info->index; + + rv = UvFsOpenFileForReading(uv->dir, info->filename, &fd, errmsg); + if (rv != 0) { + tracef("open %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err; + } + buf.base = header; + buf.len = sizeof header; + rv = UvFsReadInto(fd, &buf, errmsg); + if (rv != 0) { + tracef("read %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err_after_open; + } + + format = byteFlip64(header[0]); + if (format != UV__DISK_FORMAT) { + tracef("load %s: unsupported format %ju", info->filename, + format); + rv = RAFT_MALFORMED; + goto err_after_open; + } + + crc1 = (uint32_t)byteFlip64(header[1]); + + snapshot->configuration_index = byteFlip64(header[2]); + buf.len = (size_t)byteFlip64(header[3]); + if (buf.len > UV__META_MAX_CONFIGURATION_SIZE) { + tracef("load %s: configuration data too big (%zd)", + info->filename, buf.len); + rv = RAFT_CORRUPT; + goto err_after_open; + } + if (buf.len == 0) { + tracef("load %s: no configuration data", info->filename); + rv = RAFT_CORRUPT; + goto err_after_open; + } + buf.base = RaftHeapMalloc(buf.len); + if (buf.base == NULL) { + rv = RAFT_NOMEM; + goto err_after_open; + } + + rv = UvFsReadInto(fd, &buf, errmsg); + if (rv != 0) { + tracef("read %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err_after_buf_malloc; + } + + crc2 = byteCrc32(header + 2, sizeof header - sizeof(uint64_t) * 2, 0); + crc2 = byteCrc32(buf.base, buf.len, crc2); + + if (crc1 != crc2) { + ErrMsgPrintf(errmsg, "read %s: checksum mismatch", + info->filename); + rv = RAFT_CORRUPT; + goto err_after_buf_malloc; + } + + rv = configurationDecode(&buf, &snapshot->configuration); + if (rv != 0) { + goto err_after_buf_malloc; + } + + RaftHeapFree(buf.base); + UvOsClose(fd); + + return 0; + +err_after_buf_malloc: + RaftHeapFree(buf.base); + +err_after_open: + close(fd); + +err: + assert(rv != 0); + return rv; +} + +/* Load the snapshot data file and populate the data portion of the given + * snapshot object accordingly. 
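+ *
+ * If the file content is compressed, it is transparently decompressed before
+ * being handed to the caller; see the IsCompressed()/Decompress() calls
+ * below.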
+ */
+static int uvSnapshotLoadData(struct uv *uv,
+			      struct uvSnapshotInfo *info,
+			      struct raft_snapshot *snapshot,
+			      char *errmsg)
+{
+	char filename[UV__FILENAME_LEN];
+	struct raft_buffer buf;
+	int rv;
+
+	uvSnapshotFilenameOf(info, filename);
+
+	rv = UvFsReadFile(uv->dir, filename, &buf, errmsg);
+	if (rv != 0) {
+		tracef("read %s: %s", filename, errmsg);
+		goto err;
+	}
+
+	if (IsCompressed(buf.base, buf.len)) {
+		struct raft_buffer decompressed = {0};
+		tracef("snapshot decompress start");
+		rv = Decompress(buf, &decompressed, errmsg);
+		tracef("snapshot decompress end %d", rv);
+		if (rv != 0) {
+			tracef("decompress failed rv:%d", rv);
+			goto err_after_read_file;
+		}
+		RaftHeapFree(buf.base);
+		buf = decompressed;
+	}
+
+	snapshot->bufs = RaftHeapMalloc(sizeof *snapshot->bufs);
+	if (snapshot->bufs == NULL) {
+		rv = RAFT_NOMEM;
+		goto err_after_read_file;
+	}
+	snapshot->n_bufs = 1;
+
+	snapshot->bufs[0] = buf;
+	return 0;
+
+err_after_read_file:
+	RaftHeapFree(buf.base);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+int UvSnapshotLoad(struct uv *uv,
+		   struct uvSnapshotInfo *meta,
+		   struct raft_snapshot *snapshot,
+		   char *errmsg)
+{
+	int rv;
+	rv = uvSnapshotLoadMeta(uv, meta, snapshot, errmsg);
+	if (rv != 0) {
+		return rv;
+	}
+	rv = uvSnapshotLoadData(uv, meta, snapshot, errmsg);
+	if (rv != 0) {
+		return rv;
+	}
+	return 0;
+}
+
+struct uvSnapshotPut
+{
+	struct uv *uv;
+	size_t trailing;
+	struct raft_io_snapshot_put *req;
+	const struct raft_snapshot *snapshot;
+	struct
+	{
+		unsigned long long timestamp;
+		uint64_t header[4]; /* Format, CRC, configuration index/len */
+		struct raft_buffer bufs[2]; /* Preamble and configuration */
+	} meta;
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int status;
+	struct UvBarrierReq barrier;
+};
+
+struct uvSnapshotGet
+{
+	struct uv *uv;
+	struct raft_io_snapshot_get *req;
+	struct raft_snapshot *snapshot;
+	struct uv_work_s work;
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int status;
+	queue queue;
+};
+
+static int uvSnapshotKeepLastTwo(struct uv *uv,
+				 struct uvSnapshotInfo *snapshots,
+				 size_t n)
+{
+	size_t i;
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int rv;
+
+	/* Leave at least two snapshots, for safety. */
+	if (n <= 2) {
+		return 0;
+	}
+
+	for (i = 0; i < n - 2; i++) {
+		struct uvSnapshotInfo *snapshot = &snapshots[i];
+		char filename[UV__FILENAME_LEN];
+		rv = UvFsRemoveFile(uv->dir, snapshot->filename, errmsg);
+		if (rv != 0) {
+			tracef("unlink %s: %s", snapshot->filename, errmsg);
+			return RAFT_IOERR;
+		}
+		uvSnapshotFilenameOf(snapshot, filename);
+		rv = UvFsRemoveFile(uv->dir, filename, errmsg);
+		if (rv != 0) {
+			tracef("unlink %s: %s", filename, errmsg);
+			return RAFT_IOERR;
+		}
+	}
+
+	return 0;
+}
+
+/* Remove all segments and snapshots that are not needed anymore, because they
+ * fall behind the configured trailing amount.
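+ *
+ * The two most recent snapshots are always retained (see
+ * uvSnapshotKeepLastTwo above), and closed segments are kept as long as they
+ * hold entries within `trailing` of last_index.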
*/ +static int uvRemoveOldSegmentsAndSnapshots(struct uv *uv, + raft_index last_index, + size_t trailing, + char *errmsg) +{ + struct uvSnapshotInfo *snapshots; + struct uvSegmentInfo *segments; + size_t n_snapshots; + size_t n_segments; + int rv = 0; + + rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, + errmsg); + if (rv != 0) { + goto out; + } + rv = uvSnapshotKeepLastTwo(uv, snapshots, n_snapshots); + if (rv != 0) { + goto out; + } + if (segments != NULL) { + rv = uvSegmentKeepTrailing(uv, segments, n_segments, last_index, + trailing, errmsg); + if (rv != 0) { + goto out; + } + } + rv = UvFsSyncDir(uv->dir, errmsg); + +out: + if (snapshots != NULL) { + RaftHeapFree(snapshots); + } + if (segments != NULL) { + RaftHeapFree(segments); + } + return rv; +} + +static int makeFileCompressed(const char *dir, + const char *filename, + struct raft_buffer *bufs, + unsigned n_bufs, + char *errmsg) +{ + int rv; + + struct raft_buffer compressed = {0}; + rv = Compress(bufs, n_bufs, &compressed, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "compress %s", filename); + return RAFT_IOERR; + } + + rv = UvFsMakeFile(dir, filename, &compressed, 1, errmsg); + raft_free(compressed.base); + return rv; +} + +static void uvSnapshotPutWorkCb(uv_work_t *work) +{ + struct uvSnapshotPut *put = work->data; + struct uv *uv = put->uv; + char metadata[UV__FILENAME_LEN]; + char snapshot[UV__FILENAME_LEN]; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + sprintf(metadata, UV__SNAPSHOT_META_TEMPLATE, put->snapshot->term, + put->snapshot->index, put->meta.timestamp); + + rv = UvFsMakeFile(uv->dir, metadata, put->meta.bufs, 2, put->errmsg); + if (rv != 0) { + tracef("snapshot.meta creation failed %d", rv); + ErrMsgWrapf(put->errmsg, "write %s", metadata); + put->status = RAFT_IOERR; + return; + } + + sprintf(snapshot, UV__SNAPSHOT_TEMPLATE, put->snapshot->term, + put->snapshot->index, put->meta.timestamp); + + tracef("snapshot write start"); + if (uv->snapshot_compression) { + rv = makeFileCompressed(uv->dir, snapshot, put->snapshot->bufs, + put->snapshot->n_bufs, put->errmsg); + } else { + rv = UvFsMakeFile(uv->dir, snapshot, put->snapshot->bufs, + put->snapshot->n_bufs, put->errmsg); + } + tracef("snapshot write end %d", rv); + + if (rv != 0) { + tracef("snapshot creation failed %d", rv); + ErrMsgWrapf(put->errmsg, "write %s", snapshot); + UvFsRemoveFile(uv->dir, metadata, errmsg); + UvFsRemoveFile(uv->dir, snapshot, errmsg); + put->status = RAFT_IOERR; + return; + } + + rv = UvFsSyncDir(uv->dir, put->errmsg); + if (rv != 0) { + put->status = RAFT_IOERR; + return; + } + + rv = uvRemoveOldSegmentsAndSnapshots(uv, put->snapshot->index, + put->trailing, put->errmsg); + if (rv != 0) { + put->status = rv; + return; + } + + put->status = 0; + + return; +} + +/* Finish the put request, releasing all associated memory and invoking its + * callback. */ +static void uvSnapshotPutFinish(struct uvSnapshotPut *put) +{ + struct raft_io_snapshot_put *req = put->req; + int status = put->status; + struct uv *uv = put->uv; + assert(uv->snapshot_put_work.data == NULL); + RaftHeapFree(put->meta.bufs[1].base); + RaftHeapFree(put); + req->cb(req, status); +} + +static void uvSnapshotPutAfterWorkCb(uv_work_t *work, int status) +{ + struct uvSnapshotPut *put = work->data; + struct uv *uv = put->uv; + assert(status == 0); + uv->snapshot_put_work.data = NULL; + uvSnapshotPutFinish(put); + UvUnblock(uv); +} + +/* Start processing the given put request. 
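+ *
+ * The file writes themselves happen off the main loop: uvSnapshotPutWorkCb
+ * runs on the libuv thread pool, and uvSnapshotPutAfterWorkCb then releases
+ * the barrier via UvUnblock().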
*/ +static void uvSnapshotPutStart(struct uvSnapshotPut *put) +{ + struct uv *uv = put->uv; + int rv; + + /* If this is an install request, the barrier callback must have fired. + */ + if (put->trailing == 0) { + assert(put->barrier.data == NULL); + } + + uv->snapshot_put_work.data = put; + rv = uv_queue_work(uv->loop, &uv->snapshot_put_work, + uvSnapshotPutWorkCb, uvSnapshotPutAfterWorkCb); + if (rv != 0) { + tracef("store snapshot %lld: %s", put->snapshot->index, + uv_strerror(rv)); + uv->errored = true; + } +} + +static void uvSnapshotPutBarrierCb(struct UvBarrierReq *barrier) +{ + /* Ensure that we don't invoke this callback more than once. */ + barrier->cb = NULL; + struct uvSnapshotPut *put = barrier->data; + if (put == NULL) { + return; + } + + struct uv *uv = put->uv; + put->barrier.data = NULL; + /* If we're closing, abort the request. */ + if (uv->closing) { + put->status = RAFT_CANCELED; + uvSnapshotPutFinish(put); + uvMaybeFireCloseCb(uv); + return; + } + uvSnapshotPutStart(put); +} + +int UvSnapshotPut(struct raft_io *io, + unsigned trailing, + struct raft_io_snapshot_put *req, + const struct raft_snapshot *snapshot, + raft_io_snapshot_put_cb cb) +{ + struct uv *uv; + struct uvSnapshotPut *put; + void *cursor; + unsigned crc; + int rv; + raft_index next_index; + + uv = io->impl; + if (uv->closing) { + return RAFT_CANCELED; + } + + assert(uv->snapshot_put_work.data == NULL); + + tracef("put snapshot at %lld, keeping %d", snapshot->index, trailing); + + put = RaftHeapMalloc(sizeof *put); + if (put == NULL) { + rv = RAFT_NOMEM; + goto err; + } + put->uv = uv; + put->req = req; + put->snapshot = snapshot; + put->meta.timestamp = uv_now(uv->loop); + put->trailing = trailing; + put->barrier.data = put; + put->barrier.blocking = trailing == 0; + put->barrier.cb = uvSnapshotPutBarrierCb; + + req->cb = cb; + + /* Prepare the buffers for the metadata file. */ + put->meta.bufs[0].base = put->meta.header; + put->meta.bufs[0].len = sizeof put->meta.header; + + rv = configurationEncode(&snapshot->configuration, &put->meta.bufs[1]); + if (rv != 0) { + goto err_after_req_alloc; + } + + cursor = put->meta.header; + bytePut64(&cursor, UV__DISK_FORMAT); + bytePut64(&cursor, 0); + bytePut64(&cursor, snapshot->configuration_index); + bytePut64(&cursor, put->meta.bufs[1].len); + + crc = byteCrc32(&put->meta.header[2], sizeof(uint64_t) * 2, 0); + crc = byteCrc32(put->meta.bufs[1].base, put->meta.bufs[1].len, crc); + + cursor = &put->meta.header[1]; + bytePut64(&cursor, crc); + + /* - If the trailing parameter is set to 0, it means that we're + * restoring a snapshot. Submit a barrier request setting the next + * append index to the snapshot's last index + 1. + * - When we are only writing a snapshot during normal operation, we + * close all current open segments. New writes can continue on newly + * opened segments that will only contain entries that are newer than + * the snapshot, and we don't change append_next_index. */ + next_index = + (trailing == 0) ? 
(snapshot->index + 1) : uv->append_next_index; + rv = UvBarrier(uv, next_index, &put->barrier); + if (rv != 0) { + goto err_after_configuration_encode; + } + + return 0; + +err_after_configuration_encode: + RaftHeapFree(put->meta.bufs[1].base); +err_after_req_alloc: + RaftHeapFree(put); +err: + assert(rv != 0); + return rv; +} + +static void uvSnapshotGetWorkCb(uv_work_t *work) +{ + struct uvSnapshotGet *get = work->data; + struct uv *uv = get->uv; + struct uvSnapshotInfo *snapshots; + size_t n_snapshots; + struct uvSegmentInfo *segments; + size_t n_segments; + int rv; + get->status = 0; + rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, + get->errmsg); + if (rv != 0) { + get->status = rv; + goto out; + } + if (snapshots != NULL) { + rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], + get->snapshot, get->errmsg); + if (rv != 0) { + get->status = rv; + } + RaftHeapFree(snapshots); + } + if (segments != NULL) { + RaftHeapFree(segments); + } +out: + return; +} + +static void uvSnapshotGetAfterWorkCb(uv_work_t *work, int status) +{ + struct uvSnapshotGet *get = work->data; + struct raft_io_snapshot_get *req = get->req; + struct raft_snapshot *snapshot = get->snapshot; + int req_status = get->status; + struct uv *uv = get->uv; + assert(status == 0); + QUEUE_REMOVE(&get->queue); + RaftHeapFree(get); + req->cb(req, snapshot, req_status); + uvMaybeFireCloseCb(uv); +} + +int UvSnapshotGet(struct raft_io *io, + struct raft_io_snapshot_get *req, + raft_io_snapshot_get_cb cb) +{ + struct uv *uv; + struct uvSnapshotGet *get; + int rv; + + uv = io->impl; + assert(!uv->closing); + + get = RaftHeapMalloc(sizeof *get); + if (get == NULL) { + rv = RAFT_NOMEM; + goto err; + } + get->uv = uv; + get->req = req; + req->cb = cb; + + get->snapshot = RaftHeapMalloc(sizeof *get->snapshot); + if (get->snapshot == NULL) { + rv = RAFT_NOMEM; + goto err_after_req_alloc; + } + get->work.data = get; + + QUEUE_PUSH(&uv->snapshot_get_reqs, &get->queue); + rv = uv_queue_work(uv->loop, &get->work, uvSnapshotGetWorkCb, + uvSnapshotGetAfterWorkCb); + if (rv != 0) { + QUEUE_REMOVE(&get->queue); + tracef("get last snapshot: %s", uv_strerror(rv)); + rv = RAFT_IOERR; + goto err_after_snapshot_alloc; + } + + return 0; + +err_after_snapshot_alloc: + RaftHeapFree(get->snapshot); +err_after_req_alloc: + RaftHeapFree(get); +err: + assert(rv != 0); + return rv; +} + +#undef tracef diff --git a/src/raft/uv_tcp.c b/src/raft/uv_tcp.c new file mode 100644 index 000000000..4196b9f56 --- /dev/null +++ b/src/raft/uv_tcp.c @@ -0,0 +1,127 @@ +#include "uv_tcp.h" +#include "uv_ip.h" + +#include + +#include "../raft.h" +#include "assert.h" +#include "err.h" +#include "heap.h" + +/* Implementation of raft_uv_transport->init. */ +static int uvTcpInit(struct raft_uv_transport *transport, + raft_id id, + const char *address) +{ + struct UvTcp *t = transport->impl; + assert(id > 0); + assert(address != NULL); + t->id = id; + t->address = address; + return 0; +} + +/* Implementation of raft_uv_transport->close. 
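+ *
+ * Closing tears down both the listener sockets and any in-flight connect
+ * requests; the user close callback fires only once all of them are gone (see
+ * UvTcpMaybeFireCloseCb below).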
*/ +static void uvTcpClose(struct raft_uv_transport *transport, + raft_uv_transport_close_cb cb) +{ + struct UvTcp *t = transport->impl; + assert(!t->closing); + t->closing = true; + t->close_cb = cb; + UvTcpListenClose(t); + UvTcpConnectClose(t); + UvTcpMaybeFireCloseCb(t); +} + +void UvTcpMaybeFireCloseCb(struct UvTcp *t) +{ + if (!t->closing) { + return; + } + + assert(QUEUE_IS_EMPTY(&t->accepting)); + assert(QUEUE_IS_EMPTY(&t->connecting)); + if (!QUEUE_IS_EMPTY(&t->aborting)) { + return; + } + + if (t->listeners != NULL) { + return; + } + + if (t->close_cb != NULL) { + t->close_cb(t->transport); + } +} + +int raft_uv_tcp_init(struct raft_uv_transport *transport, + struct uv_loop_s *loop) +{ + struct UvTcp *t; + void *data = transport->data; + int version = transport->version; + if (version != 1) { + ErrMsgPrintf(transport->errmsg, "Invalid version: %d", version); + return RAFT_INVALID; + } + + memset(transport, 0, sizeof *transport); + transport->data = data; + transport->version = version; + t = raft_malloc(sizeof *t); + if (t == NULL) { + ErrMsgOom(transport->errmsg); + return RAFT_NOMEM; + } + t->transport = transport; + t->loop = loop; + t->id = 0; + t->address = NULL; + t->bind_address = NULL; + t->listeners = NULL; + t->n_listeners = 0; + t->accept_cb = NULL; + QUEUE_INIT(&t->accepting); + QUEUE_INIT(&t->connecting); + QUEUE_INIT(&t->aborting); + t->closing = false; + t->close_cb = NULL; + + transport->impl = t; + transport->init = uvTcpInit; + transport->close = uvTcpClose; + transport->listen = UvTcpListen; + transport->connect = UvTcpConnect; + + return 0; +} + +void raft_uv_tcp_close(struct raft_uv_transport *transport) +{ + struct UvTcp *t = transport->impl; + raft_free(t->bind_address); + raft_free(t); +} + +int raft_uv_tcp_set_bind_address(struct raft_uv_transport *transport, + const char *address) +{ + struct UvTcp *t = transport->impl; + char hostname[NI_MAXHOST]; + char service[NI_MAXSERV]; + int rv; + + rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service, + sizeof(service)); + if (rv != 0) { + return RAFT_INVALID; + } + + t->bind_address = raft_malloc(strlen(address) + 1); + if (t->bind_address == NULL) { + return RAFT_NOMEM; + } + strcpy(t->bind_address, address); + return 0; +} diff --git a/src/raft/uv_tcp.h b/src/raft/uv_tcp.h new file mode 100644 index 000000000..924f4f5b3 --- /dev/null +++ b/src/raft/uv_tcp.h @@ -0,0 +1,48 @@ +#ifndef UV_TCP_H_ +#define UV_TCP_H_ + +#include "../raft.h" +#include "queue.h" + +/* Protocol version. */ +#define UV__TCP_HANDSHAKE_PROTOCOL 1 + +struct UvTcp +{ + struct raft_uv_transport *transport; /* Interface object we implement */ + struct uv_loop_s *loop; /* Event loop */ + raft_id id; /* ID of this raft server */ + const char *address; /* Address of this raft server */ + unsigned n_listeners; /* Number of listener sockets */ + struct uv_tcp_s *listeners; /* Listener sockets */ + raft_uv_accept_cb accept_cb; /* Call after accepting a connection */ + queue accepting; /* Connections being accepted */ + queue connecting; /* Pending connection requests */ + queue aborting; /* Connections being aborted */ + bool closing; /* True after close() is called */ + raft_uv_transport_close_cb + close_cb; /* Call when it's safe to free us */ + char *bind_address; /* Optional address:port to bind to */ +}; + +/* Implementation of raft_uv_transport->listen. */ +int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb); + +/* Stop accepting new connection and close all connections being accepted. 
+ */
+void UvTcpListenClose(struct UvTcp *t);
+
+/* Implementation of raft_uv_transport->connect. */
+int UvTcpConnect(struct raft_uv_transport *transport,
+		 struct raft_uv_connect *req,
+		 raft_id id,
+		 const char *address,
+		 raft_uv_connect_cb cb);
+
+/* Abort all pending connection requests. */
+void UvTcpConnectClose(struct UvTcp *t);
+
+/* Fire the transport close callback if the transport is closing and there's no
+ * more pending callback. */
+void UvTcpMaybeFireCloseCb(struct UvTcp *t);
+
+#endif /* UV_TCP_H_ */
diff --git a/src/raft/uv_tcp_connect.c b/src/raft/uv_tcp_connect.c
new file mode 100644
index 000000000..e493d14a8
--- /dev/null
+++ b/src/raft/uv_tcp_connect.c
@@ -0,0 +1,382 @@
+#include <string.h>
+
+#include "assert.h"
+#include "byte.h"
+#include "err.h"
+#include "heap.h"
+#include "uv_ip.h"
+#include "uv_tcp.h"
+
+/* The happy path of a connection request is:
+ *
+ * - Create a TCP handle and submit a TCP connect request.
+ * - Initiate an asynchronous DNS resolve request.
+ * - Once the name lookup has succeeded, connect to the first resolved IP.
+ * - Once connected over TCP, submit a write request for the handshake.
+ * - Once the write completes, fire the connection request callback.
+ *
+ * Alternative happy path of a connection request, if the hostname resolves to
+ * multiple IPs and the first/second/... IP is reachable:
+ * - close the TCP handle and initiate a new connect with the next IP in the
+ *   close callback.
+ *
+ * Possible failure modes are:
+ *
+ * - The name resolution for the hostname fails: close the TCP handle and fire
+ *   the request callback.
+ *
+ * - The transport gets closed: close the TCP handle and fire the request
+ *   callback with RAFT_CANCELED.
+ *
+ * - Either the TCP connect or the write request fails: close the TCP handle
+ *   and fire the request callback with RAFT_NOCONNECTION.
+ */
+
+/* Hold state for a single connection request. */
+struct uvTcpConnect
+{
+	struct UvTcp *t;             /* Transport implementation */
+	struct raft_uv_connect *req; /* User request */
+	uv_buf_t handshake;          /* Handshake data */
+	struct uv_tcp_s *tcp;        /* TCP connection socket handle */
+	struct uv_getaddrinfo_s getaddrinfo; /* DNS resolve request */
+	const struct addrinfo
+	    *ai_current; /* The current sockaddr to connect to */
+	struct uv_connect_s connect; /* TCP connection request */
+	struct uv_write_s write;     /* TCP handshake request */
+	int status;                  /* Returned to the request callback */
+	bool resolving; /* Indicate name resolving in progress */
+	bool retry;     /* Indicate tcp connect failure handling */
+	queue queue;    /* Pending connect queue */
+};
+
+/* Encode a handshake message into the given buffer. */
+static int uvTcpEncodeHandshake(raft_id id, const char *address, uv_buf_t *buf)
+{
+	void *cursor;
+	size_t address_len = bytePad64(strlen(address) + 1);
+	buf->len = sizeof(uint64_t) + /* Protocol version. */
+		   sizeof(uint64_t) + /* Server ID. */
+		   sizeof(uint64_t) /* Size of the address buffer */;
+	buf->len += address_len;
+	buf->base = RaftHeapMalloc(buf->len);
+	if (buf->base == NULL) {
+		return RAFT_NOMEM;
+	}
+	cursor = buf->base;
+	bytePut64(&cursor, UV__TCP_HANDSHAKE_PROTOCOL);
+	bytePut64(&cursor, id);
+	bytePut64(&cursor, address_len);
+	strcpy(cursor, address);
+	return 0;
+}
+
+/* Finish the connect request, releasing its memory and firing the connect
+ * callback.
*/ +static void uvTcpConnectFinish(struct uvTcpConnect *connect) +{ + struct uv_stream_s *stream = (struct uv_stream_s *)connect->tcp; + struct raft_uv_connect *req = connect->req; + int status = connect->status; + QUEUE_REMOVE(&connect->queue); + RaftHeapFree(connect->handshake.base); + uv_freeaddrinfo(connect->getaddrinfo.addrinfo); + raft_free(connect); + req->cb(req, stream, status); +} + +/* The TCP connection handle has been closed in consequence of an error or + * because the transport is closing. */ +static void uvTcpConnectUvCloseCb(struct uv_handle_s *handle) +{ + struct uvTcpConnect *connect = handle->data; + struct UvTcp *t = connect->t; + assert(connect->status != 0); + assert(handle == (struct uv_handle_s *)connect->tcp); + RaftHeapFree(connect->tcp); + connect->tcp = NULL; + uvTcpConnectFinish(connect); + UvTcpMaybeFireCloseCb(t); +} + +/* Abort a connection request. */ +static void uvTcpConnectAbort(struct uvTcpConnect *connect) +{ + QUEUE_REMOVE(&connect->queue); + QUEUE_PUSH(&connect->t->aborting, &connect->queue); + uv_cancel((struct uv_req_s *)&connect->getaddrinfo); + /* Call uv_close on the tcp handle, if there is no getaddrinfo request + * in flight and the handle is not currently closed due to next IP + * connect attempt. + * Data structures may only be freed after the uvGetAddrInfoCb was + * triggered. Tcp handle will be closed in the uvGetAddrInfoCb in this + * case. uvTcpConnectUvCloseCb will be invoked from + * uvTcpTryNextConnectCb in case a next IP connect should be started. */ + if (!connect->resolving && !connect->retry) { + uv_close((struct uv_handle_s *)connect->tcp, + uvTcpConnectUvCloseCb); + } +} + +/* The handshake TCP write completes. Fire the connect callback. */ +static void uvTcpConnectUvWriteCb(struct uv_write_s *write, int status) +{ + struct uvTcpConnect *connect = write->data; + struct UvTcp *t = connect->t; + + if (t->closing) { + connect->status = RAFT_CANCELED; + return; + } + + if (status != 0) { + assert(status != + UV_ECANCELED); /* t->closing would have been true */ + connect->status = RAFT_NOCONNECTION; + uvTcpConnectAbort(connect); + return; + } + + uvTcpConnectFinish(connect); +} + +/* Helper function to connect to the remote node */ +static void uvTcpAsyncConnect(struct uvTcpConnect *connect); + +/* The TCP connect failed, we closed the handle and want to try with next IP */ +static void uvTcpTryNextConnectCb(struct uv_handle_s *handle) +{ + struct uvTcpConnect *connect = handle->data; + struct UvTcp *t = connect->t; + int rv; + + connect->retry = false; + + if (t->closing) { + connect->status = RAFT_CANCELED; + /* We are already in close cb for the tcp handle, simply invoke + * final cb + */ + uvTcpConnectUvCloseCb(handle); + return; + } + rv = uv_tcp_init(t->loop, connect->tcp); + assert(rv == 0); + uvTcpAsyncConnect(connect); +} + +/* The TCP connection is established. Write the handshake data. */ +static void uvTcpConnectUvConnectCb(struct uv_connect_s *req, int status) +{ + struct uvTcpConnect *connect = req->data; + struct UvTcp *t = connect->t; + int rv; + + if (t->closing) { + connect->status = RAFT_CANCELED; + return; + } + + if (status != 0) { + assert(status != + UV_ECANCELED); /* t->closing would have been true */ + connect->ai_current = connect->ai_current->ai_next; + if (connect->ai_current) { + /* For the next connect attempt we need to close the tcp + * handle. 
+			 */
+			/* To avoid interference with aborting we set a flag to
+			 * indicate the connect attempt */
+			connect->retry = true;
+			uv_close((struct uv_handle_s *)connect->tcp,
+				 uvTcpTryNextConnectCb);
+			return;
+		}
+		connect->status = RAFT_NOCONNECTION;
+		ErrMsgPrintf(t->transport->errmsg, "uv_tcp_connect(): %s",
+			     uv_strerror(status));
+		goto err;
+	}
+
+	rv = uv_write(&connect->write, (struct uv_stream_s *)connect->tcp,
+		      &connect->handshake, 1, uvTcpConnectUvWriteCb);
+	if (rv != 0) {
+		/* UNTESTED: what are the error conditions? perhaps ENOMEM */
+		connect->status = RAFT_NOCONNECTION;
+		goto err;
+	}
+
+	return;
+
+err:
+	uvTcpConnectAbort(connect);
+}
+
+/* Helper function to connect to the remote node */
+static void uvTcpAsyncConnect(struct uvTcpConnect *connect)
+{
+	int rv;
+	rv = uv_tcp_connect(&connect->connect, connect->tcp,
+			    connect->ai_current->ai_addr,
+			    uvTcpConnectUvConnectCb);
+	if (rv != 0) {
+		/* UNTESTED: since parsing succeeded, this should fail only
+		 * because of lack of system resources */
+		ErrMsgPrintf(connect->t->transport->errmsg,
+			     "uv_tcp_connect(): %s", uv_strerror(rv));
+		connect->status = RAFT_NOCONNECTION;
+		uvTcpConnectAbort(connect);
+	}
+}
+
+/* The hostname resolution has finished */
+static void uvGetAddrInfoCb(uv_getaddrinfo_t *req,
+			    int status,
+			    struct addrinfo *res)
+{
+	struct uvTcpConnect *connect = req->data;
+	struct UvTcp *t = connect->t;
+
+	connect->resolving = false; /* The name resolving phase is over */
+
+	if (t->closing) {
+		connect->status = RAFT_CANCELED;
+
+		/* We need to close the tcp handle to abort the connection
+		 * attempt */
+		uv_close((struct uv_handle_s *)connect->tcp,
+			 uvTcpConnectUvCloseCb);
+		return;
+	}
+
+	if (status < 0) {
+		ErrMsgPrintf(t->transport->errmsg, "uv_getaddrinfo(): %s",
+			     uv_err_name(status));
+		connect->status = RAFT_NOCONNECTION;
+		uvTcpConnectAbort(connect);
+		return;
+	}
+	connect->ai_current = res;
+	uvTcpAsyncConnect(connect);
+}
+
+/* Create a new TCP handle and submit a connection request to the event loop. */
+static int uvTcpConnectStart(struct uvTcpConnect *r, const char *address)
+{
+	static struct addrinfo hints = {.ai_flags = AI_V4MAPPED | AI_ADDRCONFIG,
+					.ai_family = AF_INET,
+					.ai_socktype = SOCK_STREAM,
+					.ai_protocol = 0};
+	struct UvTcp *t = r->t;
+	char hostname[NI_MAXHOST];
+	char service[NI_MAXSERV];
+	int rv;
+
+	r->handshake.base = NULL;
+
+	/* Initialize the handshake buffer.
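+	 *
+	 * As laid out by uvTcpEncodeHandshake() above, it holds three 64-bit
+	 * words (protocol version, server ID, address length) followed by the
+	 * address string, padded to a multiple of 8 bytes.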
*/ + rv = uvTcpEncodeHandshake(t->id, t->address, &r->handshake); + if (rv != 0) { + assert(rv == RAFT_NOMEM); + ErrMsgOom(t->transport->errmsg); + goto err; + } + + r->tcp = RaftHeapMalloc(sizeof *r->tcp); + if (r->tcp == NULL) { + ErrMsgOom(t->transport->errmsg); + rv = RAFT_NOMEM; + goto err; + } + + rv = uv_tcp_init(r->t->loop, r->tcp); + assert(rv == 0); + r->tcp->data = r; + + rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service, + sizeof(service)); + if (rv) { + ErrMsgPrintf( + t->transport->errmsg, + "uv_tcp_connect(): Cannot split %s into host and service", + address); + rv = RAFT_NOCONNECTION; + goto err_after_tcp_init; + } + rv = uv_getaddrinfo(r->t->loop, &r->getaddrinfo, &uvGetAddrInfoCb, + hostname, service, &hints); + if (rv) { + ErrMsgPrintf(t->transport->errmsg, + "uv_tcp_connect(): Cannot initiate getaddrinfo %s", + uv_strerror(rv)); + rv = RAFT_NOCONNECTION; + goto err_after_tcp_init; + } + r->resolving = true; /* Indicate we are in the name resolving phase */ + + return 0; + +err_after_tcp_init: + uv_close((uv_handle_t *)r->tcp, (uv_close_cb)RaftHeapFree); + +err: + RaftHeapFree(r->handshake.base); + + return rv; +} + +int UvTcpConnect(struct raft_uv_transport *transport, + struct raft_uv_connect *req, + raft_id id, + const char *address, + raft_uv_connect_cb cb) +{ + struct UvTcp *t = transport->impl; + struct uvTcpConnect *r; + int rv; + (void)id; + assert(!t->closing); + + /* Create and initialize a new TCP connection request object */ + r = RaftHeapMalloc(sizeof *r); + if (r == NULL) { + rv = RAFT_NOMEM; + ErrMsgOom(transport->errmsg); + goto err; + } + r->t = t; + r->req = req; + r->status = 0; + r->write.data = r; + r->getaddrinfo.data = r; + r->resolving = false; + r->retry = false; + r->connect.data = r; + req->cb = cb; + + /* Keep track of the pending request */ + QUEUE_PUSH(&t->connecting, &r->queue); + + /* Start connecting */ + rv = uvTcpConnectStart(r, address); + if (rv != 0) { + goto err_after_alloc; + } + + return 0; + +err_after_alloc: + QUEUE_REMOVE(&r->queue); + RaftHeapFree(r); +err: + return rv; +} + +void UvTcpConnectClose(struct UvTcp *t) +{ + while (!QUEUE_IS_EMPTY(&t->connecting)) { + struct uvTcpConnect *connect; + queue *head; + head = QUEUE_HEAD(&t->connecting); + connect = QUEUE_DATA(head, struct uvTcpConnect, queue); + uvTcpConnectAbort(connect); + } +} diff --git a/src/raft/uv_tcp_listen.c b/src/raft/uv_tcp_listen.c new file mode 100644 index 000000000..41b6ca1ad --- /dev/null +++ b/src/raft/uv_tcp_listen.c @@ -0,0 +1,427 @@ +#include + +#include "assert.h" +#include "byte.h" +#include "heap.h" +#include "uv_ip.h" +#include "uv_tcp.h" + +/* The happy path of an incoming connection is: + * + * - The connection callback is fired on the listener TCP handle, and the + * incoming connection is uv_accept()'ed. We call uv_read_start() to get + * notified about received handshake data. + * + * - Once the preamble is received, we start waiting for the server address. + * + * - Once the server address is received, we fire the receive callback. + * + * Possible failure modes are: + * + * - The accept process gets canceled in the transport->close() implementation, + * by calling tcp_accept_stop(): the incoming TCP connection handle gets + * closed, preventing any further handshake data notification, and all + * allocated memory gets released in the handle close callback. + */ + +/* Hold state for a connection being accepted. 
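+ *
+ * The handshake preamble mirrors what uvTcpEncodeHandshake() in
+ * uv_tcp_connect.c sends: protocol version, server ID and address length,
+ * followed by the address itself.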
*/ +struct uvTcpHandshake +{ + uint64_t preamble[3]; /* Preamble buffer */ + uv_buf_t address; /* Address buffer */ + size_t nread; /* Number of bytes read */ +}; + +/* Hold handshake data for a new connection being established. */ +struct uvTcpIncoming +{ + struct UvTcp *t; /* Transport implementation */ + struct uv_tcp_s + *listener; /* The tcp handle, which accepted this socket */ + struct uv_tcp_s *tcp; /* TCP connection socket handle */ + struct uvTcpHandshake handshake; /* Handshake data */ + queue queue; /* Pending accept queue */ +}; + +/* Decode the handshake preamble, containing the protocol version, the ID of the + * connecting server and the length of its address. Also, allocate the buffer to + * start reading the server address. */ +static int uvTcpDecodePreamble(struct uvTcpHandshake *h) +{ + uint64_t protocol; + protocol = byteFlip64(h->preamble[0]); + if (protocol != UV__TCP_HANDSHAKE_PROTOCOL) { + return RAFT_MALFORMED; + } + h->address.len = (size_t)byteFlip64(h->preamble[2]); + h->address.base = RaftHeapMalloc(h->address.len); + if (h->address.base == NULL) { + return RAFT_NOMEM; + } + h->nread = 0; + return 0; +} + +/* The accepted TCP client connection has been closed, release all memory + * associated with accept object. We can get here only if an error occurrent + * during the handshake or if raft_uv_transport->close() has been invoked. */ +static void uvTcpIncomingCloseCb(struct uv_handle_s *handle) +{ + struct uvTcpIncoming *incoming = handle->data; + struct UvTcp *t = incoming->t; + QUEUE_REMOVE(&incoming->queue); + if (incoming->handshake.address.base != NULL) { + RaftHeapFree(incoming->handshake.address.base); + } + RaftHeapFree(incoming->tcp); + RaftHeapFree(incoming); + UvTcpMaybeFireCloseCb(t); +} + +/* Close an incoming TCP connection which hasn't complete the handshake yet. */ +static void uvTcpIncomingAbort(struct uvTcpIncoming *incoming) +{ + struct UvTcp *t = incoming->t; + /* After uv_close() returns we are guaranteed that no more alloc_cb or + * read_cb will be called. */ + QUEUE_REMOVE(&incoming->queue); + QUEUE_PUSH(&t->aborting, &incoming->queue); + uv_close((struct uv_handle_s *)incoming->tcp, uvTcpIncomingCloseCb); +} + +/* Read the address part of the handshake. */ +static void uvTcpIncomingAllocCbAddress(struct uv_handle_s *handle, + size_t suggested_size, + uv_buf_t *buf) +{ + struct uvTcpIncoming *incoming = handle->data; + (void)suggested_size; + assert(!incoming->t->closing); + buf->base = + incoming->handshake.address.base + incoming->handshake.nread; + buf->len = incoming->handshake.address.len - incoming->handshake.nread; +} + +static void uvTcpIncomingReadCbAddress(uv_stream_t *stream, + ssize_t nread, + const uv_buf_t *buf) +{ + struct uvTcpIncoming *incoming = stream->data; + char *address; + raft_id id; + size_t n; + int rv; + + (void)buf; + assert(!incoming->t->closing); + + if (nread == 0) { + /* Empty read just ignore it. */ + return; + } + if (nread < 0) { + uvTcpIncomingAbort(incoming); + return; + } + + /* We shouldn't have read more data than the pending amount. */ + n = (size_t)nread; + assert(n <= + incoming->handshake.address.len - incoming->handshake.nread); + + /* Advance the read window */ + incoming->handshake.nread += n; + + /* If there's more data to read in order to fill the current + * read buffer, just return, we'll be invoked again. */ + if (incoming->handshake.nread < incoming->handshake.address.len) { + return; + } + + /* If we have completed reading the address, let's fire the callback. 
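+	 *
+	 * Note that ownership of the TCP handle passes to the accept
+	 * callback; only the bookkeeping structures are freed here.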
*/ + rv = uv_read_stop(stream); + assert(rv == 0); + id = byteFlip64(incoming->handshake.preamble[1]); + address = incoming->handshake.address.base; + QUEUE_REMOVE(&incoming->queue); + incoming->t->accept_cb(incoming->t->transport, id, address, + (struct uv_stream_s *)incoming->tcp); + RaftHeapFree(incoming->handshake.address.base); + RaftHeapFree(incoming); +} + +/* Read the preamble of the handshake. */ +static void uvTcpIncomingAllocCbPreamble(struct uv_handle_s *handle, + size_t suggested_size, + uv_buf_t *buf) +{ + struct uvTcpIncoming *incoming = handle->data; + (void)suggested_size; + buf->base = + (char *)incoming->handshake.preamble + incoming->handshake.nread; + buf->len = + sizeof incoming->handshake.preamble - incoming->handshake.nread; +} + +static void uvTcpIncomingReadCbPreamble(uv_stream_t *stream, + ssize_t nread, + const uv_buf_t *buf) +{ + struct uvTcpIncoming *incoming = stream->data; + size_t n; + int rv; + + (void)buf; + + if (nread == 0) { + /* Empty read just ignore it. */ + return; + } + if (nread < 0) { + uvTcpIncomingAbort(incoming); + return; + } + + /* We shouldn't have read more data than the pending amount. */ + n = (size_t)nread; + assert(n <= + sizeof incoming->handshake.preamble - incoming->handshake.nread); + + /* Advance the read window */ + incoming->handshake.nread += n; + + /* If there's more data to read in order to fill the current + * read buffer, just return, we'll be invoked again. */ + if (incoming->handshake.nread < sizeof incoming->handshake.preamble) { + return; + } + + /* If we have completed reading the preamble, let's parse it. */ + rv = uvTcpDecodePreamble(&incoming->handshake); + if (rv != 0) { + uvTcpIncomingAbort(incoming); + return; + } + + rv = uv_read_stop(stream); + assert(rv == 0); + rv = uv_read_start((uv_stream_t *)incoming->tcp, + uvTcpIncomingAllocCbAddress, + uvTcpIncomingReadCbAddress); + assert(rv == 0); +} + +/* Start reading handshake data for a new incoming connection. */ +static int uvTcpIncomingStart(struct uvTcpIncoming *incoming) +{ + int rv; + + memset(&incoming->handshake, 0, sizeof incoming->handshake); + + incoming->tcp = RaftHeapMalloc(sizeof *incoming->tcp); + if (incoming->tcp == NULL) { + return RAFT_NOMEM; + } + incoming->tcp->data = incoming; + + rv = uv_tcp_init(incoming->t->loop, incoming->tcp); + assert(rv == 0); + + rv = uv_accept((struct uv_stream_s *)incoming->listener, + (struct uv_stream_s *)incoming->tcp); + if (rv != 0) { + rv = RAFT_IOERR; + goto err_after_tcp_init; + } + rv = uv_read_start((uv_stream_t *)incoming->tcp, + uvTcpIncomingAllocCbPreamble, + uvTcpIncomingReadCbPreamble); + assert(rv == 0); + + return 0; + +err_after_tcp_init: + uv_close((uv_handle_t *)incoming->tcp, (uv_close_cb)RaftHeapFree); + return rv; +} + +#define IS_IN_ARRAY(elem, array, array_size) \ + (const char *)(elem) >= (const char *)(array) && \ + (const char *)(elem) < \ + (const char *)(array) + array_size * sizeof(*array) + +/* Called when there's a new incoming connection: create a new tcp_accept object + * and start receiving handshake data. 
+ */
+static void uvTcpListenCb(struct uv_stream_s *stream, int status)
+{
+	struct UvTcp *t = stream->data;
+	struct uvTcpIncoming *incoming;
+	int rv;
+
+	assert(IS_IN_ARRAY(stream, t->listeners, t->n_listeners));
+
+	if (status != 0) {
+		rv = RAFT_IOERR;
+		goto err;
+	}
+
+	incoming = RaftHeapMalloc(sizeof *incoming);
+	if (incoming == NULL) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+	incoming->t = t;
+	incoming->listener = (struct uv_tcp_s *)stream;
+	incoming->tcp = NULL;
+
+	QUEUE_PUSH(&t->accepting, &incoming->queue);
+
+	rv = uvTcpIncomingStart(incoming);
+	if (rv != 0) {
+		goto err_after_accept_alloc;
+	}
+
+	return;
+
+err_after_accept_alloc:
+	QUEUE_REMOVE(&incoming->queue);
+	RaftHeapFree(incoming);
+err:
+	assert(rv != 0);
+}
+
+/* Do bind/listen call on the tcp handle */
+static int uvTcpBindListen(struct uv_tcp_s *listener, struct sockaddr *addr)
+{
+	if (uv_tcp_bind(listener, addr, 0) ||
+	    uv_listen((uv_stream_t *)listener, 1, uvTcpListenCb)) {
+		return RAFT_IOERR;
+	}
+	return 0;
+}
+
+/* Create a tcp handle and do bind/listen for each IP */
+static int uvTcpListenOnMultipleIP(struct raft_uv_transport *transport,
+				   struct addrinfo *addr_infos)
+{
+	struct UvTcp *t;
+	struct addrinfo *current;
+	unsigned n_listeners;
+	int rv;
+
+	t = transport->impl;
+
+	n_listeners = 0;
+	for (current = addr_infos; current; current = current->ai_next) {
+		++n_listeners;
+	}
+
+	current = addr_infos;
+	t->listeners = raft_malloc(n_listeners * sizeof(*t->listeners));
+	if (!t->listeners) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+
+	t->n_listeners = n_listeners;
+	for (n_listeners = 0; n_listeners < t->n_listeners; ++n_listeners) {
+		struct uv_tcp_s *listener = &t->listeners[n_listeners];
+		listener->data = t;
+		if (uv_tcp_init(t->loop, listener) ||
+		    uvTcpBindListen(listener, current->ai_addr)) {
+			rv = RAFT_IOERR;
+			goto err;
+		}
+		current = current->ai_next;
+	}
+	return 0;
+
+err:
+	if (t->listeners) {
+		for (unsigned i = 0; i <= n_listeners; ++i) {
+			uv_close((struct uv_handle_s *)&t->listeners[i], NULL);
+		}
+		raft_free(t->listeners);
+		t->listeners = NULL;
+		t->n_listeners = 0;
+	}
+	return rv;
+}
+
+/* Ignore duplicate entries from glibc getaddrinfo due to
+ * https://bugzilla.redhat.com/show_bug.cgi?id=496300
+ * in case of resolving localhost */
+static bool uvIsAddressDuplication(struct addrinfo *addr_info)
+{
+	struct addrinfo *next = addr_info->ai_next;
+
+	/* Check if we have a list of length 2 */
+	if (!next || next->ai_next) {
+		return false;
+	}
+	if (addr_info->ai_addrlen != next->ai_addrlen ||
+	    bcmp(addr_info->ai_addr, next->ai_addr, addr_info->ai_addrlen)) {
+		return false;
+	}
+	return true;
+}
+
+int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb)
+{
+	struct UvTcp *t;
+	struct addrinfo *addr_infos;
+	int rv;
+
+	t = transport->impl;
+	t->accept_cb = cb;
+
+	if (t->bind_address == NULL) {
+		rv = uvIpResolveBindAddresses(t->address, &addr_infos);
+	} else {
+		rv = uvIpResolveBindAddresses(t->bind_address, &addr_infos);
+	}
+	if (rv != 0 || !addr_infos) {
+		return rv;
+	}
+	if (addr_infos->ai_next && uvIsAddressDuplication(addr_infos)) {
+		rv = uvTcpListenOnMultipleIP(transport, addr_infos->ai_next);
+	} else {
+		rv = uvTcpListenOnMultipleIP(transport, addr_infos);
+	}
+	freeaddrinfo(addr_infos);
+	return rv;
+}
+
+/* Close callback for uvTcp->listener.
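+ *
+ * All listener handles share this callback: every invocation decrements
+ * t->n_listeners, and only the invocation that brings the count to zero
+ * frees the listeners array and possibly fires the transport close
+ * callback. This is what allows UvTcpListenClose() below to simply
+ * uv_close() every listener in a loop without further bookkeeping.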
*/ +static void uvTcpListenCloseCbListener(struct uv_handle_s *handle) +{ + struct UvTcp *t = handle->data; + assert(t->closing); + assert(t->n_listeners); + assert(t->listeners); + if (--t->n_listeners == 0) { + raft_free(t->listeners); + t->listeners = NULL; + UvTcpMaybeFireCloseCb(t); + } +} + +void UvTcpListenClose(struct UvTcp *t) +{ + queue *head; + assert(t->closing); + + while (!QUEUE_IS_EMPTY(&t->accepting)) { + struct uvTcpIncoming *incoming; + head = QUEUE_HEAD(&t->accepting); + incoming = QUEUE_DATA(head, struct uvTcpIncoming, queue); + uvTcpIncomingAbort(incoming); + } + + if (t->n_listeners) { + for (unsigned i = 0; i < t->n_listeners; ++i) { + uv_close((struct uv_handle_s *)&t->listeners[i], + uvTcpListenCloseCbListener); + } + } +} diff --git a/src/raft/uv_truncate.c b/src/raft/uv_truncate.c new file mode 100644 index 000000000..51bd84fcb --- /dev/null +++ b/src/raft/uv_truncate.c @@ -0,0 +1,200 @@ +#include +#include + +#include "assert.h" +#include "byte.h" +#include "heap.h" +#include "uv.h" +#include "uv_encoding.h" + +/* Track a truncate request. */ +struct uvTruncate +{ + struct uv *uv; + struct UvBarrierReq barrier; + raft_index index; + int status; +}; + +/* Execute a truncate request in a thread. */ +static void uvTruncateWorkCb(uv_work_t *work) +{ + struct uvTruncate *truncate = work->data; + struct uv *uv = truncate->uv; + tracef("uv truncate work cb"); + struct uvSnapshotInfo *snapshots; + struct uvSegmentInfo *segments; + struct uvSegmentInfo *segment; + size_t n_snapshots; + size_t n_segments; + size_t i; + size_t j; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + /* Load all segments on disk. */ + rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, + errmsg); + if (rv != 0) { + goto err; + } + if (snapshots != NULL) { + RaftHeapFree(snapshots); + } + assert(segments != NULL); + + /* Find the segment that contains the truncate point. */ + segment = NULL; /* Suppress warnings. */ + for (i = 0; i < n_segments; i++) { + segment = &segments[i]; + if (segment->is_open) { + continue; + } + if (truncate->index >= segment->first_index && + truncate->index <= segment->end_index) { + break; + } + } + assert(i < n_segments); + + /* If the truncate index is not the first of the segment, we need to + * truncate it. */ + if (truncate->index > segment->first_index) { + rv = uvSegmentTruncate(uv, segment, truncate->index); + if (rv != 0) { + goto err_after_list; + } + } + + /* Remove all closed segments past the one containing the truncate + * index. 
*/ + for (j = i; j < n_segments; j++) { + segment = &segments[j]; + if (segment->is_open) { + continue; + } + rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg); + if (rv != 0) { + tracef("unlink segment %s: %s", segment->filename, + errmsg); + rv = RAFT_IOERR; + goto err_after_list; + } + } + rv = UvFsSyncDir(uv->dir, errmsg); + if (rv != 0) { + tracef("sync data directory: %s", errmsg); + rv = RAFT_IOERR; + goto err_after_list; + } + + RaftHeapFree(segments); + truncate->status = 0; + + tracef("uv truncate work cb ok"); + return; + +err_after_list: + RaftHeapFree(segments); +err: + assert(rv != 0); + truncate->status = rv; +} + +static void uvTruncateAfterWorkCb(uv_work_t *work, int status) +{ + assert(work != NULL); + struct uvTruncate *truncate = work->data; + assert(truncate != NULL); + struct uv *uv = truncate->uv; + assert(uv != NULL); + tracef("uv truncate after work cb status:%d", status); + assert(status == 0); + if (truncate->status != 0) { + uv->errored = true; + } + tracef("clear truncate work"); + uv->truncate_work.data = NULL; + RaftHeapFree(truncate); + UvUnblock(uv); +} + +static void uvTruncateBarrierCb(struct UvBarrierReq *barrier) +{ + struct uvTruncate *truncate = barrier->data; + struct uv *uv = truncate->uv; + tracef("uv truncate barrier cb"); + int rv; + + /* Ensure that we don't invoke this callback more than once. */ + barrier->cb = NULL; + + /* If we're closing, don't perform truncation at all and abort here. */ + if (uv->closing) { + tracef("closing => don't truncate"); + RaftHeapFree(truncate); + uvMaybeFireCloseCb(uv); + return; + } + + assert(QUEUE_IS_EMPTY(&uv->append_writing_reqs)); + assert(QUEUE_IS_EMPTY(&uv->finalize_reqs)); + assert(uv->finalize_work.data == NULL); + assert(uv->truncate_work.data == NULL); + + tracef("set truncate work"); + uv->truncate_work.data = truncate; + rv = uv_queue_work(uv->loop, &uv->truncate_work, uvTruncateWorkCb, + uvTruncateAfterWorkCb); + if (rv != 0) { + tracef("truncate index %lld: %s", truncate->index, + uv_strerror(rv)); + tracef("clear truncate work"); + uv->truncate_work.data = NULL; + uv->errored = true; + } +} + +int UvTruncate(struct raft_io *io, raft_index index) +{ + struct uv *uv; + struct uvTruncate *truncate; + int rv; + + uv = io->impl; + tracef("uv truncate %llu", index); + assert(!uv->closing); + + /* We should truncate only entries that we were requested to append in + * the first place. */ + assert(index > 0); + assert(index < uv->append_next_index); + + truncate = RaftHeapMalloc(sizeof *truncate); + if (truncate == NULL) { + rv = RAFT_NOMEM; + goto err; + } + truncate->uv = uv; + truncate->index = index; + truncate->barrier.data = truncate; + truncate->barrier.blocking = true; + truncate->barrier.cb = uvTruncateBarrierCb; + + /* Make sure that we wait for any inflight writes to finish and then + * close the current segment. 
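+	 *
+	 * The barrier request is marked as blocking, so no new append
+	 * request will be started until the barrier callback has fired and
+	 * UvUnblock() has been called, which happens in
+	 * uvTruncateAfterWorkCb() once the truncation work submitted to the
+	 * threadpool has completed.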
+	 */
+	rv = UvBarrier(uv, index, &truncate->barrier);
+	if (rv != 0) {
+		goto err_after_req_alloc;
+	}
+
+	return 0;
+
+err_after_req_alloc:
+	RaftHeapFree(truncate);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+#undef tracef
diff --git a/src/raft/uv_work.c b/src/raft/uv_work.c
new file mode 100644
index 000000000..5b6431b97
--- /dev/null
+++ b/src/raft/uv_work.c
@@ -0,0 +1,78 @@
+#include "assert.h"
+#include "heap.h"
+#include "uv.h"
+
+struct uvAsyncWork
+{
+	struct uv *uv;
+	struct raft_io_async_work *req;
+	struct uv_work_s work;
+	int status;
+	queue queue;
+};
+
+static void uvAsyncWorkCb(uv_work_t *work)
+{
+	struct uvAsyncWork *w = work->data;
+	assert(w != NULL);
+	int rv;
+	rv = w->req->work(w->req);
+	w->status = rv;
+}
+
+static void uvAsyncAfterWorkCb(uv_work_t *work, int status)
+{
+	struct uvAsyncWork *w = work->data;
+	struct raft_io_async_work *req = w->req;
+	int req_status = w->status;
+	struct uv *uv = w->uv;
+	assert(status == 0);
+
+	QUEUE_REMOVE(&w->queue);
+	RaftHeapFree(w);
+	req->cb(req, req_status);
+	uvMaybeFireCloseCb(uv);
+}
+
+int UvAsyncWork(struct raft_io *io,
+		struct raft_io_async_work *req,
+		raft_io_async_work_cb cb)
+{
+	struct uv *uv;
+	struct uvAsyncWork *async_work;
+	int rv;
+
+	uv = io->impl;
+	assert(!uv->closing);
+
+	async_work = RaftHeapMalloc(sizeof *async_work);
+	if (async_work == NULL) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+
+	async_work->uv = uv;
+	async_work->req = req;
+	async_work->work.data = async_work;
+	req->cb = cb;
+
+	QUEUE_PUSH(&uv->async_work_reqs, &async_work->queue);
+	rv = uv_queue_work(uv->loop, &async_work->work, uvAsyncWorkCb,
+			   uvAsyncAfterWorkCb);
+	if (rv != 0) {
+		QUEUE_REMOVE(&async_work->queue);
+		tracef("async work: %s", uv_strerror(rv));
+		rv = RAFT_IOERR;
+		goto err_after_req_alloc;
+	}
+
+	return 0;
+
+err_after_req_alloc:
+	RaftHeapFree(async_work);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+#undef tracef
diff --git a/src/raft/uv_writer.c b/src/raft/uv_writer.c
new file mode 100644
index 000000000..b489765b6
--- /dev/null
+++ b/src/raft/uv_writer.c
@@ -0,0 +1,544 @@
+#include "uv_writer.h"
+
+#include
+#include
+
+#include "../raft.h"
+#include "assert.h"
+#include "heap.h"
+
+/* Copy the error message from the request object to the writer object. */
+static void uvWriterReqTransferErrMsg(struct UvWriterReq *req)
+{
+	ErrMsgPrintf(req->writer->errmsg, "%s", req->errmsg);
+}
+
+/* Set the request status according to the given result code. */
+static void uvWriterReqSetStatus(struct UvWriterReq *req, int result)
+{
+	if (result < 0) {
+		ErrMsgPrintf(req->errmsg, "write failed: %d", result);
+		req->status = RAFT_IOERR;
+	} else if ((size_t)result < req->len) {
+		ErrMsgPrintf(req->errmsg,
+			     "short write: %d bytes instead of %zu", result,
+			     req->len);
+		req->status = RAFT_NOSPACE;
+	} else {
+		req->status = 0;
+	}
+}
+
+/* Remove the request from the queue of inflight writes and invoke the request
+ * callback if set. */
+static void uvWriterReqFinish(struct UvWriterReq *req)
+{
+	QUEUE_REMOVE(&req->queue);
+	if (req->status != 0) {
+		uvWriterReqTransferErrMsg(req);
+	}
+	req->cb(req, req->status);
+}
+
+/* Wrapper around the low-level OS syscall, providing a better error message.
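+ *
+ * In particular, io_setup(2) fails with EAGAIN when the requested number
+ * of events would exceed the system-wide limit on in-flight AIO requests;
+ * an operator can inspect and raise that limit through the fs.aio-max-nr
+ * sysctl, for example:
+ *
+ *   sysctl fs.aio-max-nr
+ *   sysctl -w fs.aio-max-nr=1048576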
*/ +static int uvWriterIoSetup(unsigned n, aio_context_t *ctx, char *errmsg) +{ + int rv; + rv = UvOsIoSetup(n, ctx); + if (rv != 0) { + switch (rv) { + case UV_EAGAIN: + ErrMsgPrintf(errmsg, + "AIO events user limit exceeded"); + rv = RAFT_TOOMANY; + break; + default: + UvOsErrMsg(errmsg, "io_setup", rv); + rv = RAFT_IOERR; + break; + } + return rv; + } + return 0; +} + +/* Run blocking syscalls involved in a file write request. + * + * Perform a KAIO write request and synchronously wait for it to complete. */ +static void uvWriterWorkCb(uv_work_t *work) +{ + struct UvWriterReq *req; /* Writer request object */ + struct UvWriter *w; /* Writer object */ + aio_context_t ctx; /* KAIO handle */ + struct iocb *iocbs; /* Pointer to KAIO request object */ + struct io_event event; /* KAIO response object */ + int n_events; + int rv; + + req = work->data; + w = req->writer; + + iocbs = &req->iocb; + + /* If more than one write in parallel is allowed, submit the AIO request + * using a dedicated context, to avoid synchronization issues between + * threads when multiple writes are submitted in parallel. This is + * suboptimal but in real-world users should use file systems and + * kernels with proper async write support. */ + if (w->n_events > 1) { + ctx = 0; + rv = uvWriterIoSetup(1 /* Maximum concurrent requests */, &ctx, + req->errmsg); + if (rv != 0) { + goto out; + } + } else { + ctx = w->ctx; + } + + /* Submit the request */ + rv = UvOsIoSubmit(ctx, 1, &iocbs); + if (rv != 0) { + /* UNTESTED: since we're not using NOWAIT and the parameters are + * valid, this shouldn't fail. */ + UvOsErrMsg(req->errmsg, "io_submit", rv); + rv = RAFT_IOERR; + goto out_after_io_setup; + } + + /* Wait for the request to complete */ + n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL); + assert(n_events == 1); + if (n_events != 1) { + /* UNTESTED */ + rv = n_events >= 0 ? -1 : n_events; + } + +out_after_io_setup: + if (w->n_events > 1) { + UvOsIoDestroy(ctx); + } + +out: + if (rv != 0) { + req->status = rv; + } else { + uvWriterReqSetStatus(req, (int)event.res); + } + + return; +} + +/* Callback run after writeWorkCb has returned. It normally invokes the write + * request callback. */ +static void uvWriterAfterWorkCb(uv_work_t *work, int status) +{ + struct UvWriterReq *req = work->data; /* Write file request object */ + assert(status == 0); /* We don't cancel worker requests */ + uvWriterReqFinish(req); +} + +/* Callback fired when the event fd associated with AIO write requests should be + * ready for reading (i.e. when a write has completed). */ +static void uvWriterPollCb(uv_poll_t *poller, int status, int events) +{ + struct UvWriter *w = poller->data; + uint64_t completed; /* True if the write is complete */ + unsigned i; + int n_events; + int rv; + + assert(w->event_fd >= 0); + assert(status == 0); + if (status != 0) { + /* UNTESTED libuv docs: If an error happens while polling, + * status will be < 0 and corresponds with one of the UV_E* + * error codes. */ + goto fail_requests; + } + + assert(events & UV_READABLE); + + /* Read the event file descriptor */ + rv = (int)read(w->event_fd, &completed, sizeof completed); + if (rv != sizeof completed) { + /* UNTESTED: According to eventfd(2) this is the only possible + * failure mode, meaning that epoll has indicated that the event + * FD is not yet ready. */ + assert(errno == EAGAIN); + return; + } + + /* TODO: this assertion fails in unit tests */ + /* assert(completed == 1); */ + + /* Try to fetch the write responses. 
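+	 *
+	 * (The value read from the eventfd above is a counter of
+	 * completions signalled since the last read, so more than one write
+	 * may be ready at this point; this is also why the completed == 1
+	 * assertion above is commented out.)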
+ * + * If we got here at least one write should have completed and io_events + * should return immediately without blocking. */ + n_events = + UvOsIoGetevents(w->ctx, 1, (long int)w->n_events, w->events, NULL); + assert(n_events >= 1); + if (n_events < 1) { + /* UNTESTED */ + status = n_events == 0 ? -1 : n_events; + goto fail_requests; + } + + for (i = 0; i < (unsigned)n_events; i++) { + struct io_event *event = &w->events[i]; + struct UvWriterReq *req = *((void **)&event->data); + + /* If we got EAGAIN, it means it was not possible to perform the + * write asynchronously, so let's fall back to the threadpool. + */ + if (event->res == -EAGAIN) { + req->iocb.aio_flags &= (unsigned)~IOCB_FLAG_RESFD; + req->iocb.aio_resfd = 0; + req->iocb.aio_rw_flags &= ~RWF_NOWAIT; + assert(req->work.data == NULL); + req->work.data = req; + rv = uv_queue_work(w->loop, &req->work, uvWriterWorkCb, + uvWriterAfterWorkCb); + if (rv != 0) { + /* UNTESTED: with the current libuv + * implementation this should never fail. */ + UvOsErrMsg(req->errmsg, "uv_queue_work", rv); + req->status = RAFT_IOERR; + goto finish; + } + return; + } + + uvWriterReqSetStatus(req, (int)event->res); + + finish: + uvWriterReqFinish(req); + } + + return; + +fail_requests: + while (!QUEUE_IS_EMPTY(&w->poll_queue)) { + queue *head; + struct UvWriterReq *req; + head = QUEUE_HEAD(&w->poll_queue); + req = QUEUE_DATA(head, struct UvWriterReq, queue); + uvWriterReqSetStatus(req, status); + uvWriterReqFinish(req); + } +} + +int UvWriterInit(struct UvWriter *w, + struct uv_loop_s *loop, + uv_file fd, + bool direct /* Whether to use direct I/O */, + bool async /* Whether async I/O is available */, + unsigned max_concurrent_writes, + char *errmsg) +{ + void *data = w->data; + int rv = 0; + memset(w, 0, sizeof *w); + w->data = data; + w->loop = loop; + w->fd = fd; + w->async = async; + w->ctx = 0; + w->events = NULL; + w->n_events = max_concurrent_writes; + w->event_fd = -1; + w->event_poller.data = NULL; + w->check.data = NULL; + w->close_cb = NULL; + QUEUE_INIT(&w->poll_queue); + QUEUE_INIT(&w->work_queue); + w->closing = false; + w->errmsg = errmsg; + + /* Set direct I/O if available. */ + if (direct) { + rv = UvOsSetDirectIo(w->fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fcntl", rv); + goto err; + } + } + + /* Setup the AIO context. */ + rv = uvWriterIoSetup(w->n_events, &w->ctx, errmsg); + if (rv != 0) { + goto err; + } + + /* Initialize the array of re-usable event objects. */ + w->events = RaftHeapCalloc(w->n_events, sizeof *w->events); + if (w->events == NULL) { + /* UNTESTED: todo */ + ErrMsgOom(errmsg); + rv = RAFT_NOMEM; + goto err_after_io_setup; + } + + /* Create an event file descriptor to get notified when a write has + * completed. */ + rv = UvOsEventfd(0, UV_FS_O_NONBLOCK); + if (rv < 0) { + /* UNTESTED: should fail only with ENOMEM */ + UvOsErrMsg(errmsg, "eventfd", rv); + rv = RAFT_IOERR; + goto err_after_events_alloc; + } + w->event_fd = rv; + + rv = uv_poll_init(loop, &w->event_poller, w->event_fd); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this should + * never fail. */ + UvOsErrMsg(errmsg, "uv_poll_init", rv); + rv = RAFT_IOERR; + goto err_after_event_fd; + } + w->event_poller.data = w; + + rv = uv_check_init(loop, &w->check); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this should + * never fail. 
*/ + UvOsErrMsg(errmsg, "uv_check_init", rv); + rv = RAFT_IOERR; + goto err_after_event_fd; + } + w->check.data = w; + + rv = uv_poll_start(&w->event_poller, UV_READABLE, uvWriterPollCb); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this should + * never fail. */ + UvOsErrMsg(errmsg, "uv_poll_start", rv); + rv = RAFT_IOERR; + goto err_after_event_fd; + } + + return 0; + +err_after_event_fd: + UvOsClose(w->event_fd); +err_after_events_alloc: + RaftHeapFree(w->events); +err_after_io_setup: + UvOsIoDestroy(w->ctx); +err: + assert(rv != 0); + return rv; +} + +static void uvWriterCleanUpAndFireCloseCb(struct UvWriter *w) +{ + assert(w->closing); + + UvOsClose(w->fd); + RaftHeapFree(w->events); + UvOsIoDestroy(w->ctx); + + if (w->close_cb != NULL) { + w->close_cb(w); + } +} + +static void uvWriterPollerCloseCb(struct uv_handle_s *handle) +{ + struct UvWriter *w = handle->data; + w->event_poller.data = NULL; + + /* Cancel all pending requests. */ + while (!QUEUE_IS_EMPTY(&w->poll_queue)) { + queue *head; + struct UvWriterReq *req; + head = QUEUE_HEAD(&w->poll_queue); + req = QUEUE_DATA(head, struct UvWriterReq, queue); + assert(req->work.data == NULL); + req->status = RAFT_CANCELED; + uvWriterReqFinish(req); + } + + if (w->check.data != NULL) { + return; + } + + uvWriterCleanUpAndFireCloseCb(w); +} + +static void uvWriterCheckCloseCb(struct uv_handle_s *handle) +{ + struct UvWriter *w = handle->data; + w->check.data = NULL; + if (w->event_poller.data != NULL) { + return; + } + uvWriterCleanUpAndFireCloseCb(w); +} + +static void uvWriterCheckCb(struct uv_check_s *check) +{ + struct UvWriter *w = check->data; + if (!QUEUE_IS_EMPTY(&w->work_queue)) { + return; + } + uv_close((struct uv_handle_s *)&w->check, uvWriterCheckCloseCb); +} + +void UvWriterClose(struct UvWriter *w, UvWriterCloseCb cb) +{ + int rv; + assert(!w->closing); + w->closing = true; + w->close_cb = cb; + + /* We can close the event file descriptor right away, but we shouldn't + * close the main file descriptor or destroy the AIO context since there + * might be threadpool requests in flight. */ + UvOsClose(w->event_fd); + + rv = uv_poll_stop(&w->event_poller); + assert(rv == 0); /* Can this ever fail? */ + + uv_close((struct uv_handle_s *)&w->event_poller, uvWriterPollerCloseCb); + + /* If we have requests executing in the threadpool, we need to wait for + * them. That's done in the check callback. */ + if (!QUEUE_IS_EMPTY(&w->work_queue)) { + uv_check_start(&w->check, uvWriterCheckCb); + } else { + uv_close((struct uv_handle_s *)&w->check, uvWriterCheckCloseCb); + } +} + +/* Return the total lengths of the given buffers. */ +static size_t lenOfBufs(const uv_buf_t bufs[], unsigned n) +{ + size_t len = 0; + unsigned i; + for (i = 0; i < n; i++) { + len += bufs[i].len; + } + return len; +} + +int UvWriterSubmit(struct UvWriter *w, + struct UvWriterReq *req, + const uv_buf_t bufs[], + unsigned n, + size_t offset, + UvWriterReqCb cb) +{ + int rv = 0; + struct iocb *iocbs = &req->iocb; + assert(!w->closing); + + /* TODO: at the moment we are not leveraging the support for concurrent + * writes, so ensure that we're getting write requests + * sequentially. 
*/ + if (w->n_events == 1) { + assert(QUEUE_IS_EMPTY(&w->poll_queue)); + assert(QUEUE_IS_EMPTY(&w->work_queue)); + } + + assert(w->fd >= 0); + assert(w->event_fd >= 0); + assert(w->ctx != 0); + assert(req != NULL); + assert(bufs != NULL); + assert(n > 0); + + req->writer = w; + req->len = lenOfBufs(bufs, n); + req->status = -1; + req->work.data = NULL; + req->cb = cb; + memset(&req->iocb, 0, sizeof req->iocb); + memset(req->errmsg, 0, sizeof req->errmsg); + + req->iocb.aio_fildes = (uint32_t)w->fd; + req->iocb.aio_lio_opcode = IOCB_CMD_PWRITEV; + req->iocb.aio_reqprio = 0; + *((void **)(&req->iocb.aio_buf)) = (void *)bufs; + req->iocb.aio_nbytes = n; + req->iocb.aio_offset = (int64_t)offset; + *((void **)(&req->iocb.aio_data)) = (void *)req; + +#if defined(RWF_HIPRI) + /* High priority request, if possible */ + /* TODO: do proper kernel feature detection for this one. */ + /* req->iocb.aio_rw_flags |= RWF_HIPRI; */ +#endif + +#if defined(RWF_DSYNC) + /* Use per-request synchronous I/O if available. Otherwise, we have + * opened the file with O_DSYNC. */ + /* TODO: do proper kernel feature detection for this one. */ + /* req->iocb.aio_rw_flags |= RWF_DSYNC; */ +#endif + + /* If io_submit can be run in a 100% non-blocking way, we'll try to + * write without using the threadpool. */ + if (w->async) { + req->iocb.aio_flags |= IOCB_FLAG_RESFD; + req->iocb.aio_resfd = (uint32_t)w->event_fd; + req->iocb.aio_rw_flags |= RWF_NOWAIT; + } + + /* Try to submit the write request asynchronously */ + if (w->async) { + QUEUE_PUSH(&w->poll_queue, &req->queue); + rv = UvOsIoSubmit(w->ctx, 1, &iocbs); + + /* If no error occurred, we're done, the write request was + * submitted. */ + if (rv == 0) { + goto done; + } + + QUEUE_REMOVE(&req->queue); + + /* Check the reason of the error. */ + switch (rv) { + case UV_EAGAIN: + break; + default: + /* Unexpected error */ + UvOsErrMsg(w->errmsg, "io_submit", rv); + rv = RAFT_IOERR; + goto err; + } + + /* Submitting the write would block, or NOWAIT is not + * supported. Let's run this request in the threadpool. */ + req->iocb.aio_flags &= (unsigned)~IOCB_FLAG_RESFD; + req->iocb.aio_resfd = 0; + req->iocb.aio_rw_flags &= ~RWF_NOWAIT; + } + + /* If we got here it means we need to run io_submit in the threadpool. + */ + QUEUE_PUSH(&w->work_queue, &req->queue); + req->work.data = req; + rv = uv_queue_work(w->loop, &req->work, uvWriterWorkCb, + uvWriterAfterWorkCb); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this can't + * fail. */ + req->work.data = NULL; + QUEUE_REMOVE(&req->queue); + UvOsErrMsg(w->errmsg, "uv_queue_work", rv); + rv = RAFT_IOERR; + goto err; + } + +done: + return 0; + +err: + assert(rv != 0); + return rv; +} diff --git a/src/raft/uv_writer.h b/src/raft/uv_writer.h new file mode 100644 index 000000000..db8f5c293 --- /dev/null +++ b/src/raft/uv_writer.h @@ -0,0 +1,78 @@ +/* Asynchronous API to write a file. */ + +#ifndef UV_WRITER_H_ +#define UV_WRITER_H_ + +#include + +#include "err.h" +#include "queue.h" +#include "uv_os.h" + +/* Perform asynchronous writes to a single file. */ +struct UvWriter; + +/* Callback called after the memory associated with a file handle can be + * released. 
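+ *
+ * A typical lifecycle looks like the following sketch (error handling
+ * omitted; fd is an already-opened file descriptor, and writeCb/closeCb
+ * are hypothetical user callbacks):
+ *
+ *   struct UvWriter writer;
+ *   struct UvWriterReq req;
+ *   char errmsg[RAFT_ERRMSG_BUF_SIZE];
+ *   UvWriterInit(&writer, loop, fd, false, true, 4, errmsg);
+ *   UvWriterSubmit(&writer, &req, bufs, n_bufs, 0, writeCb);
+ *   ...
+ *   UvWriterClose(&writer, closeCb);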
*/ +typedef void (*UvWriterCloseCb)(struct UvWriter *w); + +struct UvWriter +{ + void *data; /* User data */ + struct uv_loop_s *loop; /* Event loop */ + uv_file fd; /* File handle */ + bool async; /* Whether fully async I/O is supported */ + aio_context_t ctx; /* KAIO handle */ + struct io_event *events; /* Array of KAIO response objects */ + unsigned n_events; /* Length of the events array */ + int event_fd; /* Poll'ed to check if write is finished */ + struct uv_poll_s + event_poller; /* Poll event_fd for completed poll requests */ + struct uv_check_s check; /* Check for completed threadpool requests */ + UvWriterCloseCb close_cb; /* Close callback */ + queue poll_queue; /* Pollable write requests */ + queue work_queue; /* Threadpool write requests */ + bool closing; /* Whether we're closing or closed */ + char *errmsg; /* Description of last error */ +}; + +/* Initialize a file writer. */ +int UvWriterInit(struct UvWriter *w, + struct uv_loop_s *loop, + uv_file fd, + bool direct /* Whether to use direct I/O */, + bool async /* Whether async I/O is available */, + unsigned max_concurrent_writes, + char *errmsg); + +/* Close the given file and release all associated resources. */ +void UvWriterClose(struct UvWriter *w, UvWriterCloseCb cb); + +/* Write request. */ +struct UvWriterReq; + +/* Callback called after a write request has been completed. */ +typedef void (*UvWriterReqCb)(struct UvWriterReq *req, int status); + +struct UvWriterReq +{ + void *data; /* User data */ + struct UvWriter *writer; /* Originating writer */ + size_t len; /* Total number of bytes to write */ + int status; /* Request result code */ + struct uv_work_s work; /* To execute logic in the threadpool */ + UvWriterReqCb cb; /* Callback to invoke upon request completion */ + struct iocb iocb; /* KAIO request (for writing) */ + char errmsg[256]; /* Error description (for thread-safety) */ + queue queue; /* Prev/next links in the inflight queue */ +}; + +/* Asynchronously write data to the underlying file. */ +int UvWriterSubmit(struct UvWriter *w, + struct UvWriterReq *req, + const uv_buf_t bufs[], + unsigned n, + size_t offset, + UvWriterReqCb cb); + +#endif /* UV_WRITER_H_ */ diff --git a/src/roles.c b/src/roles.c index 8993b5e19..21fb6bc5f 100644 --- a/src/roles.c +++ b/src/roles.c @@ -1,9 +1,8 @@ #include -#include - #include "client/protocol.h" #include "lib/queue.h" +#include "raft.h" #include "roles.h" #include "server.h" #include "translate.h" diff --git a/src/server.h b/src/server.h index c5509e3e6..f6678323a 100644 --- a/src/server.h +++ b/src/server.h @@ -1,8 +1,6 @@ #ifndef DQLITE_SERVER_H #define DQLITE_SERVER_H -#include -#include #include #include @@ -12,6 +10,7 @@ #include "id.h" #include "lib/assert.h" #include "logger.h" +#include "raft.h" #include "registry.h" #define DQLITE_ERRMSG_BUF_SIZE 300 diff --git a/src/translate.c b/src/translate.c index 673cc3c20..32938b414 100644 --- a/src/translate.c +++ b/src/translate.c @@ -1,10 +1,9 @@ #include "translate.h" -#include - #include "assert.h" #include "leader.h" #include "protocol.h" +#include "raft.h" /* Translate a raft error to a dqlite one. 
*/ int translateRaftErrCode(int code) diff --git a/src/transport.c b/src/transport.c index 0607ec8e9..98750a74f 100644 --- a/src/transport.c +++ b/src/transport.c @@ -1,6 +1,5 @@ #include "lib/transport.h" -#include #include #include #include @@ -9,6 +8,7 @@ #include "lib/addr.h" #include "message.h" #include "protocol.h" +#include "raft.h" #include "request.h" #include "tracing.h" #include "transport.h" diff --git a/src/transport.h b/src/transport.h index 37d3629ac..5d1d3800f 100644 --- a/src/transport.h +++ b/src/transport.h @@ -9,7 +9,7 @@ #ifndef TRANSPORT_H_ #define TRANSPORT_H_ -#include +#include "raft.h" #include "../include/dqlite.h" diff --git a/src/vfs.c b/src/vfs.c index 418a60a5f..de247ca07 100644 --- a/src/vfs.c +++ b/src/vfs.c @@ -9,8 +9,6 @@ #include #include -#include - #include #include "../include/dqlite.h" @@ -19,6 +17,7 @@ #include "lib/byte.h" #include "format.h" +#include "raft.h" #include "tracing.h" #include "vfs.h" diff --git a/test/integration/test_vfs.c b/test/integration/test_vfs.c index ea4c4d207..3335c228c 100644 --- a/test/integration/test_vfs.c +++ b/test/integration/test_vfs.c @@ -1,4 +1,3 @@ -#include #include #include "../lib/fs.h" @@ -7,6 +6,7 @@ #include "../lib/sqlite.h" #include "../../include/dqlite.h" +#include "../../src/raft.h" #include diff --git a/test/lib/cluster.h b/test/lib/cluster.h index 760bd8d40..fdc9f1988 100644 --- a/test/lib/cluster.h +++ b/test/lib/cluster.h @@ -15,11 +15,9 @@ #ifndef TEST_CLUSTER_H #define TEST_CLUSTER_H -#include -#include - #include "../../src/config.h" #include "../../src/fsm.h" +#include "../../src/raft.h" #include "../../src/registry.h" #include "../../src/vfs.h" diff --git a/test/lib/raft.h b/test/lib/raft.h index fed669b5d..a36cfc461 100644 --- a/test/lib/raft.h +++ b/test/lib/raft.h @@ -5,11 +5,10 @@ #ifndef TEST_RAFT_H #define TEST_RAFT_H -#include -#include #include #include "../../src/fsm.h" +#include "../../src/raft.h" #include "../../src/transport.h" #include "fs.h" #include "logger.h" diff --git a/test/lib/raft_heap.c b/test/lib/raft_heap.c index 524529fe8..04b4dc92a 100644 --- a/test/lib/raft_heap.c +++ b/test/lib/raft_heap.c @@ -1,4 +1,4 @@ -#include +#include "../../src/raft.h" #include "fault.h" #include "raft_heap.h" diff --git a/test/raft/fuzzy/main_core.c b/test/raft/fuzzy/main_core.c new file mode 100644 index 000000000..807f4a72d --- /dev/null +++ b/test/raft/fuzzy/main_core.c @@ -0,0 +1,11 @@ +#include "../lib/runner.h" + +MunitSuite _main_suites[64]; +int _main_suites_n = 0; + +/* Test runner executable */ +int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)]) +{ + MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0}; + return munit_suite_main(&suite, (void *)"unit", argc, argv); +} diff --git a/test/raft/fuzzy/test_election.c b/test/raft/fuzzy/test_election.c new file mode 100644 index 000000000..de6b0340b --- /dev/null +++ b/test/raft/fuzzy/test_election.c @@ -0,0 +1,103 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static char *cluster_n[] = {"3", "4", "5", "7", NULL}; +static char *cluster_pre_vote[] = {"0", "1", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_N_PARAM, cluster_n}, + {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, + {NULL, NULL}, +}; + +static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) 
+{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(0); + CLUSTER_BOOTSTRAP; + CLUSTER_RANDOMIZE; + CLUSTER_START; + return f; +} + +static void tear_down(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Tests + * + *****************************************************************************/ + +SUITE(election) + +/* A leader is eventually elected */ +TEST(election, win, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + return MUNIT_OK; +} + +/* A new leader is elected if the current one dies. */ +TEST(election, change, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + CLUSTER_KILL_LEADER; + CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); + CLUSTER_STEP_UNTIL_HAS_LEADER(20000); + return MUNIT_OK; +} + +/* A new leader is elected if the current one dies and a previously killed + * server with an outdated log and outdated term is revived. */ +TEST(election, changeReviveOutdated, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + unsigned i; + + /* Kill a random server */ + i = ((unsigned)rand()) % CLUSTER_N; + CLUSTER_KILL(i); + + /* Server i's term will be lower than the term of the election. */ + CLUSTER_STEP_UNTIL_HAS_LEADER(20000); + + /* Add some entries to the log */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_KILL_LEADER; + CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); + + /* Revive server i with an outdated log and term, the cluster + * should be able to elect a new leader */ + CLUSTER_REVIVE(i); + CLUSTER_STEP_UNTIL_HAS_LEADER(20000); + return MUNIT_OK; +} + +/* If no majority of servers is online, no leader is elected. */ +TEST(election, noQuorum, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + CLUSTER_KILL_MAJORITY; + CLUSTER_STEP_UNTIL_ELAPSED(30000); + munit_assert_false(CLUSTER_HAS_LEADER); + return MUNIT_OK; +} diff --git a/test/raft/fuzzy/test_liveness.c b/test/raft/fuzzy/test_liveness.c new file mode 100644 index 000000000..98bfe0fd8 --- /dev/null +++ b/test/raft/fuzzy/test_liveness.c @@ -0,0 +1,154 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +/* Maximum number of cluster loop iterations each test should perform. */ +#define MAX_ITERATIONS 25000 + +/* Maximum number of cluster loop iterations a pair of servers should stay + * disconnected. */ +#define MAX_DISCONNECT 150 + +struct disconnection +{ + unsigned id1; + unsigned id2; + int start; + int duration; +}; + +struct fixture +{ + FIXTURE_CLUSTER; + struct disconnection *disconnections; +}; + +static char *cluster_n[] = {"3", "4", NULL}; +static char *cluster_pre_vote[] = {"0", "1", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_N_PARAM, cluster_n}, + {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, + {NULL, NULL}, +}; + +/* Return the number of distinct server pairs in the cluster. */ +static int __server_pairs(struct fixture *f) +{ + return CLUSTER_N * (CLUSTER_N - 1) / 2; +} + +/* Update the cluster connectivity for the given iteration. 
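+ *
+ * Each pair of servers gets disconnected with a probability of 1 in 10 at
+ * every iteration, for a random duration of between 50 and MAX_DISCONNECT
+ * iterations, after which the pair is reconnected.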
*/ +static void __update_connectivity(struct fixture *f, int i) +{ + int p; + int pairs = __server_pairs(f); + + for (p = 0; p < pairs; p++) { + struct disconnection *disconnection = &f->disconnections[p]; + unsigned id1 = disconnection->id1; + unsigned id2 = disconnection->id2; + + if (disconnection->start == 0) { + /* Decide whether to disconnect this pair. */ + if (munit_rand_int_range(1, 10) <= 1) { + disconnection->start = i; + disconnection->duration = + munit_rand_int_range(50, MAX_DISCONNECT); + raft_fixture_saturate(&f->cluster, id1 - 1, id2 - 1); + raft_fixture_saturate(&f->cluster, id2 - 1, id1 - 1); + } + } else { + /* Decide whether to reconnect this pair. */ + if (i - disconnection->start > disconnection->duration) { + raft_fixture_desaturate(&f->cluster, id1 - 1, id2 - 1); + raft_fixture_desaturate(&f->cluster, id2 - 1, id1 - 1); + disconnection->start = 0; + } + } + } +} + +static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + int pairs; + size_t i, j, k; + SETUP_CLUSTER(0); + CLUSTER_BOOTSTRAP; + CLUSTER_RANDOMIZE; + CLUSTER_START; + + /* Number of distinct pairs of servers. */ + pairs = __server_pairs(f); + + f->disconnections = munit_malloc(pairs * sizeof *f->disconnections); + + k = 0; + for (i = 0; i < CLUSTER_N; i++) { + for (j = i + 1; j < CLUSTER_N; j++) { + struct disconnection *disconnection = &f->disconnections[k]; + disconnection->id1 = i + 1; + disconnection->id2 = j + 1; + disconnection->start = 0; + disconnection->duration = 0; + k++; + } + } + + return f; +} + +static void tear_down(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f->disconnections); + free(f); +} + +/****************************************************************************** + * + * Tests + * + *****************************************************************************/ + +SUITE(liveness) + +static void apply_cb(struct raft_apply *req, int status, void *result) +{ + (void)status; + (void)result; + free(req); +} + +/* The system makes progress even in case of network disruptions. 
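+ *
+ * At every iteration the connectivity between pairs of servers is
+ * perturbed as described above; whenever a leader is available one more
+ * entry is submitted, and the loop stops as soon as the leader has
+ * applied at least one of them (i.e. its last applied index reaches 2).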
*/ +TEST(liveness, networkDisconnect, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + int i = 0; + + (void)params; + + for (i = 0; i < MAX_ITERATIONS; i++) { + __update_connectivity(f, i); + raft_fixture_step(&f->cluster); + + if (CLUSTER_LEADER != CLUSTER_N) { + struct raft_apply *req = munit_malloc(sizeof *req); + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, apply_cb); + if (CLUSTER_LAST_APPLIED(CLUSTER_LEADER) >= 2) { + break; + } + } + } + + // munit_assert_int(CLUSTER_LAST_APPLIED(CLUSTER_LEADER), >=, 2); + + return MUNIT_OK; +} diff --git a/test/raft/fuzzy/test_membership.c b/test/raft/fuzzy/test_membership.c new file mode 100644 index 000000000..00b3e9205 --- /dev/null +++ b/test/raft/fuzzy/test_membership.c @@ -0,0 +1,113 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; + struct raft_change req; +}; + +static char *cluster_n[] = {"3", "4", "5", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_N_PARAM, cluster_n}, + {NULL, NULL}, +}; + +static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(0); + CLUSTER_BOOTSTRAP; + CLUSTER_RANDOMIZE; + CLUSTER_START; + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + return f; +} + +static void tear_down(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Tests + * + *****************************************************************************/ + +SUITE(membership) + +TEST(membership, addNonVoting, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + const struct raft_server *server; + struct raft *raft; + + CLUSTER_ADD(&f->req); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); + + /* Then promote it. */ + CLUSTER_ASSIGN(&f->req, RAFT_STANDBY); + + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 4, 2000); + + raft = CLUSTER_RAFT(CLUSTER_LEADER); + + server = &raft->configuration.servers[CLUSTER_N - 1]; + munit_assert_int(server->id, ==, CLUSTER_N); + + return MUNIT_OK; +} + +TEST(membership, addVoting, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + const struct raft_server *server; + struct raft *raft; + + (void)params; + + CLUSTER_ADD(&f->req); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); + + /* Then promote it. 
*/ + CLUSTER_ASSIGN(&f->req, RAFT_VOTER); + + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 4, 2000); + + raft = CLUSTER_RAFT(CLUSTER_LEADER); + + server = &raft->configuration.servers[CLUSTER_N - 1]; + munit_assert_int(server->role, ==, RAFT_VOTER); + + return MUNIT_OK; +} + +TEST(membership, removeVoting, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft *raft; + int rv; + + (void)params; + + raft = CLUSTER_RAFT(CLUSTER_LEADER); + + rv = raft_remove(raft, &f->req, CLUSTER_LEADER % CLUSTER_N + 1, NULL); + munit_assert_int(rv, ==, 0); + + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); + + munit_assert_int(raft->configuration.n, ==, CLUSTER_N - 1); + + return 0; +} diff --git a/test/raft/fuzzy/test_replication.c b/test/raft/fuzzy/test_replication.c new file mode 100644 index 000000000..22821e00c --- /dev/null +++ b/test/raft/fuzzy/test_replication.c @@ -0,0 +1,175 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static char *cluster_n[] = {"3", "5", "7", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_N_PARAM, cluster_n}, + {NULL, NULL}, +}; + +static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(0); + CLUSTER_BOOTSTRAP; + CLUSTER_RANDOMIZE; + CLUSTER_START; + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + return f; +} + +static void tear_down(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +#define APPLY_ADD_ONE(REQ) CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, REQ, 1, NULL) + +/****************************************************************************** + * + * Tests + * + *****************************************************************************/ + +SUITE(replication) + +/* New entries on the leader are eventually replicated to followers. */ +TEST(replication, appendEntries, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft_apply *req = munit_malloc(sizeof *req); + (void)params; + APPLY_ADD_ONE(req); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 2000); + free(req); + return MUNIT_OK; +} + +/* The cluster remains available even if the current leader dies and a new + * leader gets elected. */ +TEST(replication, availability, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft_apply *req1 = munit_malloc(sizeof *req1); + struct raft_apply *req2 = munit_malloc(sizeof *req2); + + (void)params; + + APPLY_ADD_ONE(req1); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 2000); + + CLUSTER_KILL_LEADER; + CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + + APPLY_ADD_ONE(req2); + /* Index 3 -> 5 = APPLY entry + BARRIER entry after becoming leader */ + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 5, 2000); + + free(req1); + free(req2); + + return MUNIT_OK; +} + +static void apply_cb(struct raft_apply *req, int status, void *result) +{ + (void)status; + (void)result; + free(req); +} + +/* If no quorum is available, entries don't get committed. 
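+ *
+ * The entry below is submitted to the leader right before a majority of
+ * servers gets killed, so it can't be replicated to a quorum and must
+ * remain uncommitted: every server's last applied index stays at its
+ * initial value.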
*/ +TEST(replication, noQuorum, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft_apply *req = munit_malloc(sizeof *req); + unsigned i; + + (void)params; + + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, apply_cb); + CLUSTER_KILL_MAJORITY; + + CLUSTER_STEP_UNTIL_ELAPSED(10000); + + for (i = 0; i < CLUSTER_N; i++) { + munit_assert_int(CLUSTER_LAST_APPLIED(i), ==, 1); + } + + return MUNIT_OK; +} + +/* If the cluster is partitioned, entries don't get committed. */ +TEST(replication, partitioned, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft_apply *req1 = munit_malloc(sizeof *req1); + struct raft_apply *req2 = munit_malloc(sizeof *req2); + unsigned leader_id; + size_t i; + size_t n; + + (void)params; + + leader_id = CLUSTER_LEADER + 1; + + /* Disconnect the leader from a majority of servers */ + n = 0; + for (i = 0; n < (CLUSTER_N / 2) + 1; i++) { + struct raft *raft = CLUSTER_RAFT(i); + if (raft->id == leader_id) { + continue; + } + raft_fixture_saturate(&f->cluster, leader_id - 1, raft->id - 1); + raft_fixture_saturate(&f->cluster, raft->id - 1, leader_id - 1); + n++; + } + + /* Try to append a new entry using the disconnected leader. */ + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req1, 1, apply_cb); + + /* The leader gets deposed. */ + CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); + + /* The entry does not get committed. */ + CLUSTER_STEP_UNTIL_ELAPSED(5000); + + /* Reconnect the old leader */ + for (i = 0; i < CLUSTER_N; i++) { + struct raft *raft = CLUSTER_RAFT(i); + if (raft->id == leader_id) { + continue; + } + raft_fixture_desaturate(&f->cluster, leader_id - 1, raft->id - 1); + } + + // TODO this fails with seed 0x3914306f + CLUSTER_STEP_UNTIL_HAS_LEADER(30000); + + /* Re-try now to append the entry. */ + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req2, 1, apply_cb); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 2, 10000); + + return MUNIT_OK; +} diff --git a/test/raft/integration/append_helpers.h b/test/raft/integration/append_helpers.h new file mode 100644 index 000000000..59c1bbf38 --- /dev/null +++ b/test/raft/integration/append_helpers.h @@ -0,0 +1,102 @@ +#include "../../../src/raft/uv.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; + void *data; +}; + +static void appendCbAssertResult(struct raft_io_append *req, int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +/* Declare and fill the entries array for the append request identified by + * I. The array will have N entries, and each entry will have a data buffer of + * SIZE bytes.*/ +#define ENTRIES(I, N, SIZE) \ + struct raft_entry _entries##I[N]; \ + uint8_t _entries_data##I[N * SIZE]; \ + { \ + int _i; \ + for (_i = 0; _i < N; _i++) { \ + struct raft_entry *entry = &_entries##I[_i]; \ + entry->term = 1; \ + entry->type = RAFT_COMMAND; \ + entry->buf.base = &_entries_data##I[_i * SIZE]; \ + entry->buf.len = SIZE; \ + entry->batch = NULL; \ + munit_assert_ptr_not_null(entry->buf.base); \ + memset(entry->buf.base, 0, entry->buf.len); \ + uint64_t _temporary = f->count; \ + memcpy(entry->buf.base, &_temporary, 8); \ + f->count++; \ + } \ + } + +/* Submit an append request identified by I, with N_ENTRIES entries, each one of + * size ENTRY_SIZE. 
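+ * (The ENTRIES() helper above stamps the beginning of each entry's buffer
+ * with a 64-bit sequence number taken from f->count, so every appended
+ * entry carries distinct, recognizable data.)
+ *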
When the append request completes, CB will be called + * and DATA will be available in result->data. f->io.append is expected to + * return RV. */ +#define APPEND_SUBMIT_CB_DATA(I, N_ENTRIES, ENTRY_SIZE, CB, DATA, RV) \ + struct raft_io_append _req##I; \ + struct result _result##I = {0, false, DATA}; \ + int _rv##I; \ + ENTRIES(I, N_ENTRIES, ENTRY_SIZE); \ + _req##I.data = &_result##I; \ + _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, CB); \ + munit_assert_int(_rv##I, ==, RV) + +/* Submit an append request identified by I, with N_ENTRIES entries, each one of + * size ENTRY_SIZE. The default expectation is for the operation to succeed. A + * custom STATUS can be set with APPEND_EXPECT. */ +#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE) \ + APPEND_SUBMIT_CB_DATA(I, N_ENTRIES, ENTRY_SIZE, appendCbAssertResult, \ + NULL, 0) + +/* Try to submit an append request and assert that the given error code and + * message are returned. */ +#define APPEND_ERROR(N_ENTRIES, ENTRY_SIZE, RV, ERRMSG) \ + do { \ + struct raft_io_append _req; \ + int _rv; \ + ENTRIES(0, N_ENTRIES, ENTRY_SIZE); \ + _rv = f->io.append(&f->io, &_req, _entries0, N_ENTRIES, NULL); \ + munit_assert_int(_rv, ==, RV); \ + /* munit_assert_string_equal(f->io.errmsg, ERRMSG);*/ \ + } while (0) + +#define APPEND_EXPECT(I, STATUS) _result##I.status = STATUS + +/* Wait for the append request identified by I to complete. */ +#define APPEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done) + +/* Submit an append request with an entries array with N_ENTRIES entries, each + * one of size ENTRY_SIZE, and wait for the operation to successfully + * complete. */ +#define APPEND(N_ENTRIES, ENTRY_SIZE) \ + do { \ + APPEND_SUBMIT(0, N_ENTRIES, ENTRY_SIZE); \ + APPEND_WAIT(0); \ + } while (0) + +/* Submit an append request with the given parameters and wait for the operation + * to fail with the given code and message. 
*/ +#define APPEND_FAILURE(N_ENTRIES, ENTRY_SIZE, STATUS, ERRMSG) \ + { \ + APPEND_SUBMIT(0, N_ENTRIES, ENTRY_SIZE); \ + APPEND_EXPECT(0, STATUS); \ + APPEND_WAIT(0); \ + f->count--; \ + munit_assert_string_equal(f->io.errmsg, ERRMSG); \ + } diff --git a/test/raft/integration/main_core.c b/test/raft/integration/main_core.c new file mode 100644 index 000000000..ad1798bba --- /dev/null +++ b/test/raft/integration/main_core.c @@ -0,0 +1,3 @@ +#include "../lib/runner.h" + +RUNNER("core") diff --git a/test/raft/integration/main_uv.c b/test/raft/integration/main_uv.c new file mode 100644 index 000000000..7f2eba543 --- /dev/null +++ b/test/raft/integration/main_uv.c @@ -0,0 +1,3 @@ +#include "../lib/runner.h" + +RUNNER("uv") diff --git a/test/raft/integration/test_apply.c b/test/raft/integration/test_apply.c new file mode 100644 index 000000000..650df5a93 --- /dev/null +++ b/test/raft/integration/test_apply.c @@ -0,0 +1,160 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(2); + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_ELECT(0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; + raft_index prev_applied; + struct raft *raft; +}; + +static void applyCbAssertResult(struct raft_apply *req, int status, void *_) +{ + struct result *result = req->data; + (void)_; + munit_assert_int(status, ==, result->status); + if (status == 0) { + munit_assert_ulong(result->prev_applied, <, + raft_last_applied(result->raft)); + } + result->done = true; +} + +static bool applyCbHasFired(struct raft_fixture *f, void *arg) +{ + struct result *result = arg; + (void)f; + return result->done; +} + +/* Submit an apply request. */ +#define APPLY_SUBMIT(I, N) \ + struct raft_buffer _buf; \ + struct raft_apply _req; \ + struct raft *r = CLUSTER_RAFT(I); \ + struct result _result = {0, false, raft_last_applied(r), r}; \ + int _rv; \ + FsmEncodeSetX(N, &_buf); \ + _req.data = &_result; \ + _rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, 1, applyCbAssertResult); \ + munit_assert_int(_rv, ==, 0); + +/* Expect the apply callback to fire with the given status. */ +#define APPLY_EXPECT(STATUS) _result.status = STATUS + +/* Wait until an apply request completes. */ +#define APPLY_WAIT CLUSTER_STEP_UNTIL(applyCbHasFired, &_result, 2000) + +/* Submit to the I'th server a request to apply a new RAFT_COMMAND entry and + * wait for the operation to succeed. */ +#define APPLY(I, N) \ + do { \ + APPLY_SUBMIT(I, N); \ + APPLY_WAIT; \ + } while (0) + +/* Submit to the I'th server a request to apply a new RAFT_COMMAND entry and + * assert that the given error is returned. 
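+ *
+ * For example, APPLY_ERROR(1, RAFT_NOTLEADER, "server is not the leader")
+ * asserts that a follower refuses the request, as in the notLeader test
+ * below.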
+ */
+#define APPLY_ERROR(I, RV, ERRMSG)                                        \
+	do {                                                              \
+		struct raft_buffer _buf;                                  \
+		struct raft_apply _req;                                   \
+		int _rv;                                                  \
+		FsmEncodeSetX(123, &_buf);                                \
+		_rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, 1, NULL); \
+		munit_assert_int(_rv, ==, RV);                            \
+		munit_assert_string_equal(CLUSTER_ERRMSG(I), ERRMSG);     \
+		raft_free(_buf.base);                                     \
+	} while (0)
+
+/******************************************************************************
+ *
+ * Success scenarios
+ *
+ *****************************************************************************/
+
+SUITE(raft_apply)
+
+/* Append the very first command entry. */
+TEST(raft_apply, first, setUp, tearDown, 0, NULL)
+{
+	struct fixture *f = data;
+	int val = 123;
+	APPLY(0, val);
+	munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val);
+	return MUNIT_OK;
+}
+
+/* Append two command entries. */
+TEST(raft_apply, two, setUp, tearDown, 0, NULL)
+{
+	struct fixture *f = data;
+	int val = 123;
+	APPLY(0, val);
+	munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val);
+	val = 124;
+	APPLY(0, val);
+	munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val);
+	return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * Failure scenarios
+ *
+ *****************************************************************************/
+
+/* If the raft instance is not in leader state, an error is returned. */
+TEST(raft_apply, notLeader, setUp, tearDown, 0, NULL)
+{
+	struct fixture *f = data;
+	APPLY_ERROR(1, RAFT_NOTLEADER, "server is not the leader");
+	return MUNIT_OK;
+}
+
+/* If the raft instance steps down from leader state, the apply callback fires
+ * with an error. */
+TEST(raft_apply, leadershipLost, setUp, tearDown, 0, NULL)
+{
+	struct fixture *f = data;
+	APPLY_SUBMIT(0, 123);
+	APPLY_EXPECT(RAFT_LEADERSHIPLOST);
+	CLUSTER_DEPOSE;
+	APPLY_WAIT;
+	return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_assign.c b/test/raft/integration/test_assign.c
new file mode 100644
index 000000000..7404b3fe6
--- /dev/null
+++ b/test/raft/integration/test_assign.c
@@ -0,0 +1,457 @@
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+	FIXTURE_CLUSTER;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+struct result
+{
+	int status;
+	bool done;
+};
+
+/* Add an empty server to the cluster and start it. */
+#define GROW                                        \
+	{                                           \
+		int rv__;                           \
+		CLUSTER_GROW;                       \
+		rv__ = raft_start(CLUSTER_RAFT(2)); \
+		munit_assert_int(rv__, ==, 0);      \
+	}
+
+static void changeCbAssertResult(struct raft_change *req, int status)
+{
+	struct result *result = req->data;
+	munit_assert_int(status, ==, result->status);
+	result->done = true;
+}
+
+static bool changeCbHasFired(struct raft_fixture *f, void *arg)
+{
+	struct result *result = arg;
+	(void)f;
+	return result->done;
+}
+
+/* Submit an add request.
 */
+#define ADD_SUBMIT(I, ID)                                                  \
+    struct raft_change _req;                                               \
+    char _address[16];                                                     \
+    struct result _result = {0, false};                                    \
+    int _rv;                                                               \
+    _req.data = &_result;                                                  \
+    sprintf(_address, "%d", ID);                                           \
+    _rv =                                                                  \
+        raft_add(CLUSTER_RAFT(I), &_req, ID, _address, changeCbAssertResult); \
+    munit_assert_int(_rv, ==, 0);
+
+#define ADD(I, ID)                                            \
+    do {                                                      \
+        ADD_SUBMIT(I, ID);                                    \
+        CLUSTER_STEP_UNTIL(changeCbHasFired, &_result, 2000); \
+    } while (0)
+
+/* Submit an assign role request. */
+#define ASSIGN_SUBMIT(I, ID, ROLE)                                             \
+    struct raft_change _req;                                                   \
+    struct result _result = {0, false};                                        \
+    int _rv;                                                                   \
+    _req.data = &_result;                                                      \
+    _rv = raft_assign(CLUSTER_RAFT(I), &_req, ID, ROLE, changeCbAssertResult); \
+    munit_assert_int(_rv, ==, 0);
+
+/* Expect the request callback to fire with the given status. */
+#define ASSIGN_EXPECT(STATUS) _result.status = STATUS;
+
+/* Wait until an assign request completes. */
+#define ASSIGN_WAIT CLUSTER_STEP_UNTIL(changeCbHasFired, &_result, 10000)
+
+/* Submit a request to the I'th server to assign the given role to the server
+ * with the given ID, and wait for the operation to succeed. */
+#define ASSIGN(I, ID, ROLE)             \
+    do {                                \
+        ASSIGN_SUBMIT(I, ID, ROLE);     \
+        ASSIGN_WAIT;                    \
+    } while (0)
+
+/* Invoke raft_assign() against the I'th server and assert it returns the
+ * given error code. */
+#define ASSIGN_ERROR(I, ID, ROLE, RV, ERRMSG)                        \
+    {                                                                \
+        struct raft_change __req;                                    \
+        int __rv;                                                    \
+        __rv = raft_assign(CLUSTER_RAFT(I), &__req, ID, ROLE, NULL); \
+        munit_assert_int(__rv, ==, RV);                              \
+        munit_assert_string_equal(ERRMSG, CLUSTER_ERRMSG(I));        \
+    }
+
+/******************************************************************************
+ *
+ * Set up a cluster of 2 servers, with the first as leader.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(2);
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Assertions
+ *
+ *****************************************************************************/
+
+/* Assert the values of the committed and uncommitted configuration indexes on
+ * the raft instance with the given index. */
+#define ASSERT_CONFIGURATION_INDEXES(I, COMMITTED, UNCOMMITTED)                \
+    {                                                                          \
+        struct raft *raft_ = CLUSTER_RAFT(I);                                  \
+        munit_assert_int(raft_->configuration_committed_index, ==, COMMITTED); \
+        munit_assert_int(raft_->configuration_uncommitted_index, ==,           \
+                         UNCOMMITTED);                                         \
+    }
+
+/* Assert that the state of the current catch up round matches the given
+ * values. */
+#define ASSERT_CATCH_UP_ROUND(I, PROMOTEE_ID, NUMBER, DURATION)               \
+    {                                                                         \
+        struct raft *raft_ = CLUSTER_RAFT(I);                                 \
+        munit_assert_int(raft_->leader_state.promotee_id, ==, PROMOTEE_ID);   \
+        munit_assert_int(raft_->leader_state.round_number, ==, NUMBER);       \
+        munit_assert_int(                                                     \
+            raft_->io->time(raft_->io) - raft_->leader_state.round_start, >=, \
+            DURATION);                                                        \
+    }
+
+/******************************************************************************
+ *
+ * raft_assign
+ *
+ *****************************************************************************/
+
+SUITE(raft_assign)
+
+/* Assigning the voter role to a spare server whose log is already up-to-date
+ * causes the relevant configuration change to be submitted immediately.
 */
+TEST(raft_assign, promoteUpToDate, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft *raft;
+    const struct raft_server *server;
+    GROW;
+    ADD(0, 3);
+    CLUSTER_STEP_N(3);
+
+    ASSIGN(0, 3, RAFT_VOTER);
+
+    /* Server 3 is being considered as voting, even though the configuration
+     * change is not committed yet. */
+    raft = CLUSTER_RAFT(0);
+    server = &raft->configuration.servers[2];
+    munit_assert_int(server->role, ==, RAFT_VOTER);
+
+    /* The configuration change request eventually succeeds. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000);
+
+    return MUNIT_OK;
+}
+
+static bool thirdServerHasCaughtUp(struct raft_fixture *f, void *arg)
+{
+    struct raft *raft = raft_fixture_get(f, 0);
+    (void)arg;
+    return raft->leader_state.promotee_id == 0;
+}
+
+/* Assigning the voter role to a spare server whose log is not up-to-date
+ * causes catch-up rounds to start. When the server has caught up, the
+ * configuration change request gets submitted. */
+TEST(raft_assign, promoteCatchUp, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft *raft;
+    const struct raft_server *server;
+    CLUSTER_MAKE_PROGRESS;
+    GROW;
+    ADD(0, 3);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+
+    /* Server 3 is not being considered as voting, since its log is behind. */
+    raft = CLUSTER_RAFT(0);
+    server = &raft->configuration.servers[2];
+    munit_assert_int(server->role, ==, RAFT_SPARE);
+
+    /* Advance the match index of server 3, by acknowledging the AppendEntries
+     * request that the leader has sent to it. */
+    CLUSTER_STEP_UNTIL_APPLIED(2, 3, 2000);
+
+    /* Disconnect the second server, so it doesn't participate in the quorum */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+
+    /* Eventually the leader notices that the third server has caught up. */
+    CLUSTER_STEP_UNTIL(thirdServerHasCaughtUp, NULL, 2000);
+
+    /* The leader has submitted a configuration change request, but it's
+     * uncommitted. */
+    ASSERT_CONFIGURATION_INDEXES(0, 4, 5);
+
+    /* The third server notifies that it has appended the new
+     * configuration. Since it's considered voting already, it counts for the
+     * majority and the entry gets committed. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, 5, 2000);
+    CLUSTER_STEP_UNTIL_APPLIED(2, 5, 2000);
+
+    /* The promotion is completed. */
+    ASSERT_CONFIGURATION_INDEXES(0, 5, 0);
+
+    return MUNIT_OK;
+}
+
+static bool thirdServerHasCompletedFirstRound(struct raft_fixture *f, void *arg)
+{
+    struct raft *raft = raft_fixture_get(f, 0);
+    (void)arg;
+    return raft->leader_state.round_number != 1;
+}
+
+/* Assigning the voter role to a spare server whose log is not up-to-date
+ * causes catch-up rounds to start. If new entries are appended after a round
+ * is started, a new round is initiated once the former one completes. */
+TEST(raft_assign, promoteNewRound, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    unsigned election_timeout = CLUSTER_RAFT(0)->election_timeout;
+    struct raft_apply *req = munit_malloc(sizeof *req);
+    CLUSTER_MAKE_PROGRESS;
+    GROW;
+    ADD(0, 3);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+    ASSERT_CATCH_UP_ROUND(0, 3, 1, 0);
+
+    /* Now that the catch-up round started, submit a new entry and set a very
+     * high latency on the server being promoted, so it won't deliver
+     * AppendEntries results within the round duration.
 */
+    CLUSTER_APPLY_ADD_X(0, req, 1, NULL);
+    CLUSTER_STEP_UNTIL_ELAPSED(election_timeout + 100);
+
+    // FIXME: unstable with 0xcf1f25b6
+    // ASSERT_CATCH_UP_ROUND(0, 3, 1, election_timeout + 100);
+
+    /* The leader eventually receives the AppendEntries result from the
+     * promotee, acknowledging all entries except the last one. The first round
+     * completes and a new one starts. */
+    CLUSTER_STEP_UNTIL(thirdServerHasCompletedFirstRound, NULL, 2000);
+
+    /* Eventually the server is promoted and everyone applies the entry. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 5000);
+
+    /* The promotion is eventually completed. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, req->index + 1, 5000);
+    ASSERT_CONFIGURATION_INDEXES(0, 6, 0);
+
+    free(req);
+
+    return MUNIT_SKIP;
+}
+
+static bool secondServerHasNewConfiguration(struct raft_fixture *f, void *arg)
+{
+    struct raft *raft = raft_fixture_get(f, 1);
+    (void)arg;
+    return raft->configuration.servers[2].role == RAFT_VOTER;
+}
+
+/* If a follower receives an AppendEntries RPC containing a RAFT_CHANGE entry
+ * which changes the role of a server, the configuration change is immediately
+ * applied locally, even if the entry is not yet committed. Once the entry is
+ * committed, the change becomes permanent. */
+TEST(raft_assign, changeIsImmediate, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    GROW;
+    CLUSTER_MAKE_PROGRESS;
+    ADD(0, 3);
+    CLUSTER_STEP_UNTIL_APPLIED(1, 4, 2000);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+    CLUSTER_STEP_UNTIL(secondServerHasNewConfiguration, NULL, 3000);
+    ASSERT_CONFIGURATION_INDEXES(1, 4, 5);
+
+    ASSIGN_WAIT;
+
+    return MUNIT_OK;
+}
+
+/* Assign the stand-by role to an idle server. */
+TEST(raft_assign, promoteToStandBy, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    GROW;
+    ADD(0, 3);
+    ASSIGN(0, 3, RAFT_STANDBY);
+    return MUNIT_OK;
+}
+
+/* Trying to promote a server on a raft instance which is not the leader
+ * results in an error. */
+TEST(raft_assign, notLeader, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    ASSIGN_ERROR(1, 3, RAFT_VOTER, RAFT_NOTLEADER, "server is not the leader");
+    return MUNIT_OK;
+}
+
+/* Trying to change the role of a server whose ID is unknown results in an
+ * error. */
+TEST(raft_assign, unknownId, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    ASSIGN_ERROR(0, 3, RAFT_VOTER, RAFT_NOTFOUND, "no server has ID 3");
+    return MUNIT_OK;
+}
+
+/* Trying to assign an unknown role to a server results in an error. */
+TEST(raft_assign, badRole, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    ASSIGN_ERROR(0, 3, 999, RAFT_BADROLE, "server role is not valid");
+    return MUNIT_OK;
+}
+
+/* Trying to assign the voter role to a server which already has it results in
+ * an error. */
+TEST(raft_assign, alreadyHasRole, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    ASSIGN_ERROR(0, 1, RAFT_VOTER, RAFT_BADROLE, "server is already voter");
+    return MUNIT_OK;
+}
+
+/* Trying to assign a new role to a server while a configuration change is in
+ * progress results in an error.
 */
+TEST(raft_assign, changeRequestAlreadyInProgress, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    GROW;
+    ADD(0, 3);
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+    ASSIGN_ERROR(0, 3, RAFT_VOTER, RAFT_CANTCHANGE,
+                 "a configuration change is already in progress");
+    ASSIGN_WAIT;
+    return MUNIT_OK;
+}
+
+/* If leadership is lost before the configuration change log entry for setting
+ * the new server role is committed, the leader configuration gets rolled back
+ * and the role of the server being changed is reverted. */
+TEST(raft_assign, leadershipLost, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    const struct raft_server *server;
+    /* TODO: fix */
+    return MUNIT_SKIP;
+    GROW;
+    ADD(0, 3);
+    CLUSTER_STEP_N(2);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+
+    /* Server 3 is being considered as voting, even though the configuration
+     * change is not committed yet. */
+    ASSERT_CATCH_UP_ROUND(0, 0, 0, 0);
+    ASSERT_CONFIGURATION_INDEXES(0, 2, 3);
+    server = configurationGet(&CLUSTER_RAFT(0)->configuration, 3);
+    munit_assert_int(server->role, ==, RAFT_VOTER);
+
+    /* Lose leadership. */
+    CLUSTER_DEPOSE;
+
+    /* A new leader gets elected. */
+    CLUSTER_ELECT(1);
+    CLUSTER_STEP_N(5);
+
+    /* Server 3 is not being considered voting anymore. */
+    server = configurationGet(&CLUSTER_RAFT(0)->configuration, 3);
+    munit_assert_int(server->role, ==, RAFT_STANDBY);
+
+    return MUNIT_OK;
+}
+
+/* Trying to assign the voter role to an unresponsive server eventually
+ * fails. */
+TEST(raft_assign, promoteUnresponsive, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_MAKE_PROGRESS;
+    GROW;
+    ADD(0, 3);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+    CLUSTER_KILL(2);
+
+    ASSIGN_EXPECT(RAFT_NOCONNECTION);
+    ASSIGN_WAIT;
+
+    return MUNIT_OK;
+}
+
+/* Demote a voter node to stand-by.
*/ +TEST(raft_assign, demoteToStandBy, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ASSIGN(0, 2, RAFT_STANDBY); + return MUNIT_OK; +} + +/* The leader can be demoted to stand-by and will no longer act as leader */ +TEST(raft_assign, demoteLeader, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ASSIGN_SUBMIT(0, 1, RAFT_STANDBY); + munit_assert_int(CLUSTER_LEADER, ==, 0); + ASSIGN_WAIT; + CLUSTER_STEP_UNTIL_HAS_LEADER(5000); + munit_assert_int(CLUSTER_LEADER, !=, 0); + return MUNIT_OK; +} + +/* The leader can be demoted to spare and will no longer act as leader */ +TEST(raft_assign, demoteLeaderToSpare, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ASSIGN_SUBMIT(0, 1, RAFT_SPARE); + munit_assert_int(CLUSTER_LEADER, ==, 0); + ASSIGN_WAIT; + CLUSTER_STEP_UNTIL_HAS_LEADER(5000); + munit_assert_int(CLUSTER_LEADER, !=, 0); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_barrier.c b/test/raft/integration/test_barrier.c new file mode 100644 index 000000000..8d95a8095 --- /dev/null +++ b/test/raft/integration/test_barrier.c @@ -0,0 +1,94 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(2); + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_ELECT(0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; +}; + +static void barrierCbAssertResult(struct raft_barrier *req, int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +static bool barrierCbHasFired(struct raft_fixture *f, void *arg) +{ + struct result *result = arg; + (void)f; + return result->done; +} + +/* Submit a barrier request. */ +#define BARRIER_SUBMIT(I) \ + struct raft_barrier _req; \ + struct result _result = {0, false}; \ + int _rv; \ + _req.data = &_result; \ + _rv = raft_barrier(CLUSTER_RAFT(I), &_req, barrierCbAssertResult); \ + munit_assert_int(_rv, ==, 0); + +/* Expect the barrier callback to fire with the given status. */ +#define BARRIER_EXPECT(STATUS) _result.status = STATUS + +/* Wait until the barrier request completes. */ +#define BARRIER_WAIT CLUSTER_STEP_UNTIL(barrierCbHasFired, &_result, 2000) + +/* Submit to the I'th server a barrier request and wait for the operation to + * succeed. 
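+ *
+ * (A barrier carries no payload: raft_barrier() appends a no-op entry and the
+ * callback fires once that entry is committed and applied, which is how these
+ * tests flush any work in flight. A minimal sketch of the direct call,
+ * assuming the request stays alive until the callback runs:
+ *
+ *     struct raft_barrier req;
+ *     struct result result = {0, false};
+ *     req.data = &result;
+ *     int rv = raft_barrier(CLUSTER_RAFT(0), &req, barrierCbAssertResult);
+ *     munit_assert_int(rv, ==, 0);
+ * )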
 */
+#define BARRIER(I)             \
+    do {                       \
+        BARRIER_SUBMIT(I);     \
+        BARRIER_WAIT;          \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Success scenarios
+ *
+ *****************************************************************************/
+
+SUITE(raft_barrier)
+
+TEST(raft_barrier, cb, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    BARRIER(0);
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_bootstrap.c b/test/raft/integration/test_bootstrap.c
new file mode 100644
index 000000000..43043f967
--- /dev/null
+++ b/test/raft/integration/test_bootstrap.c
@@ -0,0 +1,57 @@
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Fixture holding a pristine raft instance.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_CLUSTER;
+};
+
+static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(1);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Bootstrap tests.
+ *
+ *****************************************************************************/
+
+SUITE(raft_bootstrap)
+
+/* Attempting to bootstrap an instance that's already started results in
+ * RAFT_BUSY. */
+TEST(raft_bootstrap, busy, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft *raft;
+    struct raft_configuration configuration;
+    int rv;
+
+    /* Bootstrap and start the first server. */
+    CLUSTER_BOOTSTRAP_N_VOTING(1);
+    CLUSTER_START;
+
+    raft = CLUSTER_RAFT(0);
+    CLUSTER_CONFIGURATION(&configuration);
+    rv = raft_bootstrap(raft, &configuration);
+    munit_assert_int(rv, ==, RAFT_BUSY);
+    raft_configuration_close(&configuration);
+
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_digest.c b/test/raft/integration/test_digest.c
new file mode 100644
index 000000000..98e5ee9e0
--- /dev/null
+++ b/test/raft/integration/test_digest.c
@@ -0,0 +1,14 @@
+#include "../../../src/raft.h"
+#include "../lib/runner.h"
+
+SUITE(raft_digest)
+
+/* Generation of the ID of the bootstrap dqlite node.
*/ +TEST(raft_digest, bootstrapServerId, NULL, NULL, 0, NULL) +{ + const char *address = "127.0.0.1:65536"; + unsigned long long id; + id = raft_digest(address, 0); + munit_assert_int(id, ==, 138882483); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_election.c b/test/raft/integration/test_election.c new file mode 100644 index 000000000..d67b8e8ff --- /dev/null +++ b/test/raft/integration/test_election.c @@ -0,0 +1,800 @@ +#include "../../../src/raft/configuration.h" +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + unsigned i; + SETUP_CLUSTER(2); + CLUSTER_BOOTSTRAP; + for (i = 0; i < CLUSTER_N; i++) { + struct raft *raft = CLUSTER_RAFT(i); + raft->data = f; + } + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Parameters + * + *****************************************************************************/ + +static char *cluster_5[] = {"5", NULL}; + +static MunitParameterEnum cluster_5_params[] = { + {CLUSTER_N_PARAM, cluster_5}, + {NULL, NULL}, +}; + +static char *cluster_3[] = {"3", NULL}; + +static MunitParameterEnum cluster_3_params[] = { + {CLUSTER_N_PARAM, cluster_3}, + {NULL, NULL}, +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Wait until the I'th server becomes candidate. */ +#define STEP_UNTIL_CANDIDATE(I) \ + CLUSTER_STEP_UNTIL_STATE_IS(I, RAFT_CANDIDATE, 2000) + +/* Wait until the I'th server becomes leader. */ +#define STEP_UNTIL_LEADER(I) CLUSTER_STEP_UNTIL_STATE_IS(I, RAFT_LEADER, 2000) + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert that the I'th server is in follower state. */ +#define ASSERT_FOLLOWER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_FOLLOWER) + +/* Assert that the I'th server is in candidate state. */ +#define ASSERT_CANDIDATE(I) \ + munit_assert_int(CLUSTER_STATE(I), ==, RAFT_CANDIDATE) + +/* Assert that the I'th server is in leader state. */ +#define ASSERT_LEADER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_LEADER) + +/* Assert that the I'th server is unavailable. */ +#define ASSERT_UNAVAILABLE(I) \ + munit_assert_int(CLUSTER_STATE(I), ==, RAFT_UNAVAILABLE) + +/* Assert that the I'th server has voted for the server with the given ID. */ +#define ASSERT_VOTED_FOR(I, ID) munit_assert_int(CLUSTER_VOTED_FOR(I), ==, ID) + +/* Assert that the I'th server has the given current term. 
 */
+#define ASSERT_TERM(I, TERM)                             \
+    {                                                    \
+        struct raft *raft_ = CLUSTER_RAFT(I);            \
+        munit_assert_int(raft_->current_term, ==, TERM); \
+    }
+
+/* Assert that the fixture time matches the given value */
+#define ASSERT_TIME(TIME) munit_assert_int(CLUSTER_TIME, ==, TIME)
+
+/******************************************************************************
+ *
+ * Successful election round
+ *
+ *****************************************************************************/
+
+SUITE(election)
+
+/* Test an election round with two voters. */
+TEST(election, twoVoters, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    (void)params;
+    CLUSTER_START;
+
+    /* The first server eventually times out and converts to candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    CLUSTER_STEP; /* Server 1 tick */
+    ASSERT_FOLLOWER(1);
+
+    CLUSTER_STEP; /* Server 0 completes sending a RequestVote RPC */
+    CLUSTER_STEP; /* Server 1 receives RequestVote RPC */
+    ASSERT_VOTED_FOR(1, 1);
+    ASSERT_TIME(1015);
+
+    CLUSTER_STEP; /* Server 1 completes sending the RequestVote result */
+    CLUSTER_STEP; /* Server 0 receives the RequestVote result */
+    ASSERT_LEADER(0);
+    ASSERT_TIME(1030);
+
+    return MUNIT_OK;
+}
+
+/* If we have already voted and the same candidate requests the vote again, the
+ * vote is granted. */
+TEST(election, grantAgain, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    (void)params;
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 10000);
+    raft_set_election_timeout(CLUSTER_RAFT(1), 10000);
+    CLUSTER_START;
+
+    /* The first server converts to candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    CLUSTER_STEP; /* Server 1 tick */
+    ASSERT_FOLLOWER(1);
+
+    /* Disconnect the second server, so the first server does not receive the
+     * result and eventually starts a new election round. */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+    CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TIME(2000);
+
+    /* Reconnecting the two servers eventually makes the first server win the
+     * election. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 1);
+    STEP_UNTIL_LEADER(0);
+    ASSERT_TIME(2030);
+
+    return MUNIT_OK;
+}
+
+/* If the requester's last log entry index is the same, the vote is granted. */
+TEST(election, grantIfLastIndexIsSame, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    (void)params;
+
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 1;
+    FsmEncodeSetX(1, &entry1.buf);
+
+    entry2.type = RAFT_COMMAND;
+    entry2.term = 1;
+    FsmEncodeSetX(1, &entry2.buf);
+
+    CLUSTER_ADD_ENTRY(0, &entry1);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+    CLUSTER_SET_TERM(1, 2);
+
+    CLUSTER_START;
+
+    /* The first server converts to candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+
+    /* The first server eventually receives a RequestVote result RPC and
+     * converts to leader */
+    STEP_UNTIL_LEADER(0);
+    ASSERT_TIME(1030);
+
+    return MUNIT_OK;
+}
+
+/* If the requester's last log entry index is higher, the vote is granted. */
+TEST(election, grantIfLastIndexIsHigher, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    (void)params;
+
+    entry.type = RAFT_COMMAND;
+    entry.term = 1;
+    FsmEncodeSetX(1, &entry.buf);
+
+    CLUSTER_ADD_ENTRY(0, &entry);
+    CLUSTER_SET_TERM(1, 2);
+
+    CLUSTER_START;
+
+    /* The first server converts to candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+
+    /* The second server grants its vote.
*/ + CLUSTER_STEP_UNTIL_VOTED_FOR(1, 0, 2000); + + /* The first server receives a RequestVote result RPC and converts to + * leader */ + CLUSTER_STEP_N(2); + ASSERT_LEADER(0); + + return MUNIT_OK; +} + +/* If a candidate receives a vote request response granting the vote but the + * quorum is not reached, it stays candidate. */ +TEST(election, waitQuorum, setUp, tearDown, 0, cluster_5_params) +{ + struct fixture *f = data; + (void)params; + CLUSTER_START; + + /* The first server converts to candidate. */ + STEP_UNTIL_CANDIDATE(0); + + /* All servers grant their vote. */ + CLUSTER_STEP_UNTIL_VOTED_FOR(1, 0, 2000); + CLUSTER_STEP_UNTIL_VOTED_FOR(2, 0, 2000); + CLUSTER_STEP_UNTIL_VOTED_FOR(3, 0, 2000); + CLUSTER_STEP_UNTIL_VOTED_FOR(4, 0, 2000); + ASSERT_TIME(1015); + + /* The first server receives the first RequestVote result RPC but stays + * candidate since it has only 2 votes, and 3 are required. */ + CLUSTER_STEP_N(4); /* Send completes on all other servers */ + CLUSTER_STEP; /* First message is delivered */ + ASSERT_TIME(1030); + ASSERT_CANDIDATE(0); + + /* Eventually we are elected */ + CLUSTER_STEP; /* Second message is delivered */ + ASSERT_LEADER(0); /* Server 0 reaches the quorum */ + ASSERT_TIME(1030); + + return MUNIT_OK; +} + +/* The vote request gets rejected if our term is higher. */ +TEST(election, rejectIfHigherTerm, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + CLUSTER_SET_TERM(1, 3); + CLUSTER_START; + + /* The first server converts to candidate. */ + STEP_UNTIL_CANDIDATE(0); + + CLUSTER_STEP_N(3); /* Server 1 tick and RequestVote send/delivery */ + + /* The second server receives a RequestVote RPC and rejects the vote for the + * first server. */ + ASSERT_VOTED_FOR(1, 0); + + CLUSTER_STEP_N(2); /* RequestVote result send/delivery */ + + /* The first server receives the RequestVote result RPC and converts to + * follower because it discovers the newer term. */ + ASSERT_FOLLOWER(0); + + return 0; +} + +/* If the server already has a leader, the vote is not granted (even if the + * request has a higher term). */ +TEST(election, rejectIfHasLeader, setUp, tearDown, 0, cluster_3_params) +{ + struct fixture *f = data; + (void)params; + CLUSTER_START; + + /* Server 0 wins the elections. */ + STEP_UNTIL_LEADER(0); + + /* Server 2 gets disconnected and becomes candidate. */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + STEP_UNTIL_CANDIDATE(2); + + /* Server 2 stays candidate since its requests get rejected. */ + CLUSTER_STEP_N(20); + ASSERT_CANDIDATE(2); + + return MUNIT_OK; +} + +/* If a server has already voted, vote is not granted. */ +TEST(election, rejectIfAlreadyVoted, setUp, tearDown, 0, cluster_3_params) +{ + struct fixture *f = data; + (void)params; + + /* Disconnect server 1 from server 0 and change its randomized election + * timeout to match the one of server 0. This way server 1 will convert to + * candidate but not receive vote requests. */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 1000); + CLUSTER_SATURATE_BOTHWAYS(0, 1); + + CLUSTER_START; + + /* Server 0 and server 1 both become candidates. */ + STEP_UNTIL_CANDIDATE(0); + STEP_UNTIL_CANDIDATE(1); + ASSERT_TIME(1000); + + /* Server 2 receives the vote request from server 0 and grants it. */ + CLUSTER_STEP_UNTIL_VOTED_FOR(2, 0, 2000); + ASSERT_TIME(1015); + + /* Server 0 receives the vote result from server 2 and becomes leader. */ + STEP_UNTIL_LEADER(0); + ASSERT_TIME(1030); + + /* Server 1 is still candidate because its vote request got rejected. 
 */
+    ASSERT_CANDIDATE(1);
+
+    return MUNIT_OK;
+}
+
+/* If the requester's last log entry term is lower than ours, the vote is not
+ * granted. */
+TEST(election, rejectIfLastTermIsLower, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    (void)params;
+
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 1;
+    FsmEncodeSetX(123, &entry1.buf);
+
+    entry2.type = RAFT_COMMAND;
+    entry2.term = 2;
+    FsmEncodeSetX(456, &entry2.buf);
+
+    CLUSTER_ADD_ENTRY(0, &entry1);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    CLUSTER_START;
+
+    /* The first server becomes candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    /* The second server receives a RequestVote RPC and rejects the vote for
+     * the first server. */
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100);
+    ASSERT_VOTED_FOR(1, 0);
+    ASSERT_TIME(1015);
+
+    /* The first server receives the response and stays candidate. */
+    CLUSTER_STEP_UNTIL_DELIVERED(1, 0, 100);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TIME(1030);
+
+    /* Eventually the second server becomes leader because its log is more
+     * up-to-date. */
+    STEP_UNTIL_LEADER(1);
+    ASSERT_TIME(1130);
+
+    return MUNIT_OK;
+}
+
+/* If the requester's last log entry index is lower, the vote is not
+ * granted. */
+TEST(election, rejectIfLastIndexIsLower, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    (void)params;
+
+    entry.type = RAFT_COMMAND;
+    entry.term = 2;
+    FsmEncodeSetX(123, &entry.buf);
+
+    CLUSTER_ADD_ENTRY(1, &entry);
+
+    CLUSTER_START;
+
+    /* The first server becomes candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    /* The second server receives a RequestVote RPC and rejects the vote for
+     * the first server. */
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100);
+    ASSERT_VOTED_FOR(1, 0);
+    ASSERT_TIME(1015);
+
+    /* The first server receives the response and stays candidate. */
+    CLUSTER_STEP_UNTIL_DELIVERED(1, 0, 100);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TIME(1030);
+
+    /* Eventually the second server becomes leader because it has a longer
+     * log. */
+    STEP_UNTIL_LEADER(1);
+    ASSERT_TIME(1130);
+
+    return MUNIT_OK;
+}
+
+static char *reject_not_voting_n[] = {"3", NULL};
+static char *reject_not_voting_n_voting[] = {"2", NULL};
+
+static MunitParameterEnum reject_not_voting_params[] = {
+    {CLUSTER_N_PARAM, reject_not_voting_n},
+    {CLUSTER_N_VOTING_PARAM, reject_not_voting_n_voting},
+    {NULL, NULL},
+};
+
+/* If we are not a voting server, the vote is not granted. */
+TEST(election, rejectIfNotVoter, setUp, tearDown, 0, reject_not_voting_params)
+{
+    struct fixture *f = data;
+
+    /* Disconnect server 0 from server 1, so server 0 can't win the elections
+     * (since there are only 2 voting servers). */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+
+    CLUSTER_START;
+
+    /* Server 0 becomes candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    /* Server 0 stays candidate because it can't reach a quorum. */
+    CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TIME(2000);
+
+    return MUNIT_OK;
+}
+
+/* If a candidate server receives a response indicating that the vote was not
+ * granted, nothing happens (e.g. the server has already voted for someone
+ * else).
 */
+TEST(election, receiveRejectResult, setUp, tearDown, 0, cluster_5_params)
+{
+    struct fixture *f = data;
+    (void)params;
+
+    /* Lower the randomized election timeout of server 4, so it becomes
+     * candidate just after server 0 */
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 4, 1020);
+
+    /* Disconnect server 0 from all others except server 1. */
+    CLUSTER_SATURATE_BOTHWAYS(0, 2);
+    CLUSTER_SATURATE_BOTHWAYS(0, 3);
+    CLUSTER_SATURATE_BOTHWAYS(0, 4);
+
+    /* Disconnect server 4 from all others except server 1. */
+    CLUSTER_SATURATE_BOTHWAYS(4, 0);
+    CLUSTER_SATURATE_BOTHWAYS(4, 2);
+    CLUSTER_SATURATE_BOTHWAYS(4, 3);
+
+    CLUSTER_START;
+
+    /* Server 0 becomes candidate, while server 4 is still a follower. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+    ASSERT_FOLLOWER(4);
+
+    /* Server 1 receives a RequestVote RPC and grants its vote. */
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100);
+    ASSERT_TIME(1015);
+    ASSERT_VOTED_FOR(1, 1);
+    ASSERT_CANDIDATE(0);
+    ASSERT_FOLLOWER(4);
+
+    /* Disconnect server 0 from server 1, so it doesn't receive further
+     * messages. */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+
+    /* Server 4 eventually becomes candidate too. */
+    STEP_UNTIL_CANDIDATE(4);
+    ASSERT_TIME(1100);
+    ASSERT_CANDIDATE(0);
+
+    /* The second server receives a RequestVote RPC but rejects its vote since
+     * it has already voted. */
+    CLUSTER_STEP_UNTIL_DELIVERED(4, 0, 100);
+    ASSERT_VOTED_FOR(1, 1);
+    ASSERT_CANDIDATE(0);
+    ASSERT_CANDIDATE(4);
+
+    return MUNIT_OK;
+}
+
+/* An I/O error occurs when persisting the term while converting to
+ * candidate. */
+TEST(election, ioErrorConvertTerm, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_START;
+
+    raft_fixture_term_fault(&f->cluster, 0, 0);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_UNAVAILABLE, 2000);
+
+    return MUNIT_OK;
+}
+
+/* An I/O error occurs when persisting the vote while converting to
+ * candidate. */
+TEST(election, ioErrorConvertVote, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_START;
+
+    raft_fixture_vote_fault(&f->cluster, 0, 0);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_UNAVAILABLE, 2000);
+
+    return MUNIT_OK;
+}
+
+/* An I/O error occurs when sending a vote request, and gets ignored. */
+TEST(election, ioErrorSendVoteRequest, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_START;
+
+    /* The first server fails to send a RequestVote RPC. */
+    raft_fixture_send_fault(&f->cluster, 0, 0);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 5000);
+
+    return MUNIT_OK;
+}
+
+/* An I/O error occurs when the second node tries to persist its vote. */
+TEST(election, ioErrorPersistVote, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_START;
+
+    /* The first server becomes candidate. */
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE, 2000);
+
+    /* The second server receives a RequestVote RPC but fails to persist its
+     * vote. */
+    raft_fixture_vote_fault(&f->cluster, 1, 0);
+    CLUSTER_STEP_UNTIL_STATE_IS(1, RAFT_UNAVAILABLE, 1000);
+
+    return MUNIT_OK;
+}
+
+/* Test an election round with two voters and pre-vote.
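+ *
+ * (With pre-vote, the election runs in two phases: the candidate first sends
+ * pre-vote RequestVote RPCs that neither bump its own term nor make voters
+ * persist anything, and only after a majority answers favourably does it
+ * increment its term and run the real election, as the assertions below
+ * trace step by step. Pre-vote is opt-in and enabled per instance:
+ *
+ *     raft_set_pre_vote(CLUSTER_RAFT(0), true);
+ * )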
 */
+TEST(election, preVote, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    raft_set_pre_vote(CLUSTER_RAFT(0), true);
+    raft_set_pre_vote(CLUSTER_RAFT(1), true);
+    CLUSTER_START;
+
+    /* The first server eventually times out and converts to candidate, but it
+     * does not increment its term yet. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+    ASSERT_TERM(0, 1);
+
+    CLUSTER_STEP; /* Server 1 tick */
+    ASSERT_FOLLOWER(1);
+
+    CLUSTER_STEP; /* Server 0 completes sending a pre-vote RequestVote RPC */
+    CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(1, 1);      /* Server 1 does not increment its term */
+    ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    CLUSTER_STEP; /* Server 1 completes sending pre-vote RequestVote result */
+    CLUSTER_STEP; /* Server 0 receives the pre-vote RequestVote result */
+    ASSERT_CANDIDATE(0);
+    ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */
+    ASSERT_TIME(1030);
+
+    CLUSTER_STEP; /* Server 0 completes sending an actual RequestVote RPC */
+    CLUSTER_STEP; /* Server 1 receives the actual RequestVote RPC */
+    ASSERT_TERM(1, 2);      /* Server 1 does increment its term. */
+    ASSERT_VOTED_FOR(1, 1); /* Server 1 does persist its vote */
+
+    CLUSTER_STEP; /* Server 1 completes sending actual RequestVote result */
+    CLUSTER_STEP; /* Server 0 receives the actual RequestVote result */
+    ASSERT_LEADER(0);
+
+    return MUNIT_OK;
+}
+
+/* A candidate receives votes then crashes. */
+TEST(election, preVoteWithcandidateCrash, setUp, tearDown, 0, cluster_3_params)
+{
+    struct fixture *f = data;
+    raft_set_pre_vote(CLUSTER_RAFT(0), true);
+    raft_set_pre_vote(CLUSTER_RAFT(1), true);
+    raft_set_pre_vote(CLUSTER_RAFT(2), true);
+    CLUSTER_START;
+
+    /* The first server eventually times out and converts to candidate, but it
+     * does not increment its term yet. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+    ASSERT_TERM(0, 1);
+
+    /* Servers 1 and 2 tick */
+    CLUSTER_STEP_N(2);
+    ASSERT_FOLLOWER(1);
+    ASSERT_FOLLOWER(2);
+
+    /* Server 0 completes sending the pre-vote RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+
+    CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(1, 1);      /* Server 1 does not increment its term */
+    ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    CLUSTER_STEP; /* Server 2 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(2, 1);      /* Server 2 does not increment its term */
+    ASSERT_VOTED_FOR(2, 0); /* Server 2 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    /* Servers 1 and 2 complete sending pre-vote RequestVote results */
+    CLUSTER_STEP_N(2);
+
+    /* Server 0 receives the pre-vote RequestVote results */
+    CLUSTER_STEP_N(2);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */
+    ASSERT_TIME(1030);
+
+    /* Server 0 completes sending the actual RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+
+    CLUSTER_STEP; /* Server 1 receives the actual RequestVote RPC */
+    ASSERT_TERM(1, 2);      /* Server 1 does increment its term. */
+    ASSERT_VOTED_FOR(1, 1); /* Server 1 does persist its vote */
+
+    CLUSTER_STEP; /* Server 2 receives the actual RequestVote RPC */
+    ASSERT_TERM(2, 2);      /* Server 2 does increment its term. */
+    ASSERT_VOTED_FOR(2, 1); /* Server 2 does persist its vote */
+
+    /* Server 0 crashes. */
+    CLUSTER_KILL(0);
+
+    /* Server 1 times out and starts an election.
+     * It doesn't increment its term */
+    STEP_UNTIL_CANDIDATE(1);
+    ASSERT_TIME(2200);
+    ASSERT_TERM(1, 2);
+
+    /* Server 1 completes sending the pre-vote RequestVote RPCs and server 2
+     * has received those RPCs.
+     * Since server 2 has no current leader (the leader crashed before sending
+     * a heartbeat), it will grant its vote to server 1, but will not persist
+     * it due to pre-vote; its persisted vote is still for server 0 (id 1). */
+    CLUSTER_STEP_N(5);
+    ASSERT_TERM(2, 2); /* Server 2 does not increment its term */
+    ASSERT_VOTED_FOR(2, 1);
+
+    /* Server 1 receives the pre-vote RequestVote result */
+    CLUSTER_STEP_N(2);
+    /* Server 1 increments its term to start a non-pre-vote election */
+    ASSERT_TERM(1, 3);      /* Server 1 has now incremented its term. */
+    ASSERT_VOTED_FOR(1, 2); /* Server 1 has persisted its vote */
+    ASSERT_TIME(2230);
+
+    /* Server 1 completes sending the actual RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+    /* Server 2 receives the actual RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+    ASSERT_VOTED_FOR(2, 2); /* Server 2 persists its vote */
+
+    /* Server 1 receives the RequestVote results and becomes leader */
+    CLUSTER_STEP_N(2);
+    ASSERT_LEADER(1);
+    return MUNIT_OK;
+}
+
+/* Ensure delayed pre-vote responses are not counted towards the real election
+ * quorum. */
+TEST(election, preVoteNoStaleVotes, setUp, tearDown, 0, cluster_3_params)
+{
+    struct fixture *f = data;
+    raft_set_pre_vote(CLUSTER_RAFT(0), true);
+    raft_set_pre_vote(CLUSTER_RAFT(1), true);
+    raft_set_pre_vote(CLUSTER_RAFT(2), true);
+
+    /* Server 2 is 1 term ahead of the other servers; this will allow it to
+     * send stale pre-vote responses that pass the term checks. */
+    CLUSTER_SET_TERM(2, 2);
+    CLUSTER_START;
+
+    /* The first server eventually times out and converts to candidate, but it
+     * does not increment its term yet. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+    ASSERT_TERM(0, 1);
+
+    /* Servers 1 and 2 tick */
+    CLUSTER_STEP_N(2);
+    ASSERT_FOLLOWER(1);
+    ASSERT_FOLLOWER(2);
+
+    /* Server 0 completes sending the pre-vote RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+
+    CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(1, 1);      /* Server 1 does not increment its term */
+    ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    CLUSTER_STEP; /* Server 2 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(2, 2);      /* Server 2 does not increment its term */
+    ASSERT_VOTED_FOR(2, 0); /* Server 2 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    /* Slow down responses of server 2 */
+    CLUSTER_SET_NETWORK_LATENCY(2, 100);
+
+    /* Server 1 completes sending pre-vote RequestVote results */
+    CLUSTER_STEP_N(2);
+
+    /* Server 0 receives the pre-vote RequestVote results */
+    CLUSTER_STEP_N(2);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */
+    ASSERT_TIME(1030);
+
+    /* Don't send messages from 0; this ensures no real RequestVote RPCs are
+     * sent */
+    CLUSTER_SATURATE(0, 1);
+    CLUSTER_SATURATE(0, 2);
+
+    /* Wait until all messages from 2 to 0 are delivered */
+    CLUSTER_STEP_UNTIL_DELIVERED(2, 0, 100);
+
+    /* Make sure we haven't counted the pre-vote result as a real vote */
+    ASSERT_CANDIDATE(0);
+    return MUNIT_OK;
+}
+
+/* A follower doesn't convert to candidate while waiting for log entries to be
+ * persisted. */
+TEST(election, inFlightAppendBlocksCandidacy, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply req;
+
+    /* Server 1 takes a long time to persist entries.
*/ + CLUSTER_SET_DISK_LATENCY(1, 10000); + + CLUSTER_START; + + /* Server 0 is the leader. It replicates a log entry. */ + CLUSTER_ELECT(0); + CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); + + /* Server 1 receives the entry. */ + CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 1000); + + /* Contact is lost between servers 0 and 1. */ + CLUSTER_SATURATE(0, 1); + CLUSTER_SATURATE(1, 0); + + /* Several election timeouts lapse, but server 1 does not become a + * candidate, because it's waiting for the entry to be persisted. */ + CLUSTER_STEP_UNTIL_ELAPSED(5000); + munit_assert_int(CLUSTER_STATE(1), ==, RAFT_FOLLOWER); + + /* Eventually, server 1 finishes persisting the entry and becomes a + * candidate. */ + CLUSTER_STEP_UNTIL_STATE_IS(1, RAFT_CANDIDATE, 10000); + + return MUNIT_OK; +} diff --git a/test/raft/integration/test_fixture.c b/test/raft/integration/test_fixture.c new file mode 100644 index 000000000..c693ea273 --- /dev/null +++ b/test/raft/integration/test_fixture.c @@ -0,0 +1,306 @@ +#include "../../../src/raft.h" +#include "../lib/fsm.h" +#include "../lib/heap.h" +#include "../lib/runner.h" + +#define N_SERVERS 3 + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + struct raft_fsm fsms[N_SERVERS]; + struct raft_fixture fixture; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_calloc(1, sizeof *f); + struct raft_configuration configuration; + unsigned i; + int rc; + SET_UP_HEAP; + for (i = 0; i < N_SERVERS; i++) { + FsmInit(&f->fsms[i], 2); + } + + rc = raft_fixture_init(&f->fixture); + munit_assert_int(rc, ==, 0); + + for (i = 0; i < N_SERVERS; i++) { + rc = raft_fixture_grow(&f->fixture, &f->fsms[i]); + munit_assert_int(rc, ==, 0); + } + + rc = raft_fixture_configuration(&f->fixture, N_SERVERS, &configuration); + munit_assert_int(rc, ==, 0); + + rc = raft_fixture_bootstrap(&f->fixture, &configuration); + munit_assert_int(rc, ==, 0); + + raft_configuration_close(&configuration); + + rc = raft_fixture_start(&f->fixture); + munit_assert_int(rc, ==, 0); + + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + unsigned i; + raft_fixture_close(&f->fixture); + for (i = 0; i < N_SERVERS; i++) { + FsmClose(&f->fsms[i]); + } + TEAR_DOWN_HEAP; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +#define GET(I) raft_fixture_get(&f->fixture, I) +#define STEP raft_fixture_step(&f->fixture) +#define STEP_N(N) raft_fixture_step_n(&f->fixture, N) +#define STEP_UNTIL_STATE_IS(I, STATE) \ + { \ + bool done_; \ + done_ = raft_fixture_step_until_state_is(&f->fixture, I, STATE, 2000); \ + munit_assert_true(done_); \ + } +#define STATE(I) raft_state(GET(I)) +#define ELECT(I) raft_fixture_elect(&f->fixture, I) +#define DEPOSE raft_fixture_depose(&f->fixture) +#define APPLY(I, REQ) \ + { \ + struct raft_buffer buf; \ + int rc; \ + FsmEncodeAddX(1, &buf); \ + rc = raft_apply(GET(I), REQ, &buf, 1, NULL); \ + munit_assert_int(rc, ==, 0); \ + } +#define STEP_UNTIL_APPLIED(INDEX) \ + raft_fixture_step_until_applied(&f->fixture, N_SERVERS, INDEX, INDEX * 1000) + +/****************************************************************************** + * + * Assertions + * + 
 *****************************************************************************/
+
+/* Assert that the fixture time matches the given value */
+#define ASSERT_TIME(TIME) \
+    munit_assert_int(raft_fixture_time(&f->fixture), ==, TIME)
+
+/* Assert that the I'th server is in the given state. */
+#define ASSERT_STATE(I, S) munit_assert_int(STATE(I), ==, S)
+
+/* Assert that the x field of the FSM with the given index matches the given
+ * value. */
+#define ASSERT_FSM_X(I, VALUE) munit_assert_int(FsmGetX(&f->fsms[I]), ==, VALUE)
+
+/******************************************************************************
+ *
+ * raft_fixture_step
+ *
+ *****************************************************************************/
+
+SUITE(raft_fixture_step)
+
+/* If there is no disk I/O in progress or network messages in flight, the tick
+ * callbacks are called. */
+TEST(raft_fixture_step, tick, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_fixture_event *event;
+    (void)params;
+
+    ASSERT_TIME(0);
+
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(100);
+
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 1);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(100);
+
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 2);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(100);
+
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(200);
+
+    return MUNIT_OK;
+}
+
+/* By default the election timeout of server 0 is the first to expire. */
+TEST(raft_fixture_step, electionTimeout, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_fixture_event *event;
+    (void)params;
+    event = STEP_N(28);
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(1000);
+    ASSERT_STATE(0, RAFT_CANDIDATE);
+    ASSERT_STATE(1, RAFT_FOLLOWER);
+    ASSERT_STATE(2, RAFT_FOLLOWER);
+    munit_log(MUNIT_LOG_INFO, "done");
+    return MUNIT_OK;
+}
+
+/* Send requests are flushed immediately. */
+TEST(raft_fixture_step, flushSend, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_fixture_event *event;
+    (void)params;
+    STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE);
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK);
+    ASSERT_TIME(1000);
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK);
+    ASSERT_TIME(1000);
+    return MUNIT_OK;
+}
+
+/* Messages are delivered according to the current network latency.
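+ *
+ * (The fixture's default latency appears to be 15 ms: a message sent at time
+ * 1000 below arrives at 1015. Other tests in this suite tune it per server
+ * via the CLUSTER_SET_NETWORK_LATENCY wrapper, e.g. slowing server 2 down to
+ * 100 ms:
+ *
+ *     CLUSTER_SET_NETWORK_LATENCY(2, 100);
+ * )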
*/ +TEST(raft_fixture_step, deliver, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_fixture_event *event; + (void)params; + STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE); /* Server 0 starts election */ + STEP_N(2); /* Server 0 sends 2 RequestVote */ + STEP_N(2); /* Ticks for server 1 and 2 */ + ASSERT_TIME(1000); + event = STEP; + munit_assert_int(raft_fixture_event_server_index(event), ==, 0); + munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK); + ASSERT_TIME(1015); + return MUNIT_OK; +} + +/****************************************************************************** + * + * raft_fixture_elect + * + *****************************************************************************/ + +SUITE(raft_fixture_elect) + +/* Trigger the election of the first server. */ +TEST(raft_fixture_elect, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ELECT(0); + ASSERT_STATE(0, RAFT_LEADER); + ASSERT_STATE(1, RAFT_FOLLOWER); + ASSERT_STATE(2, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/* Trigger the election of the second server. */ +TEST(raft_fixture_elect, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ELECT(1); + ASSERT_STATE(0, RAFT_FOLLOWER); + ASSERT_STATE(1, RAFT_LEADER); + ASSERT_STATE(2, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/* Trigger an election change. */ +TEST(raft_fixture_elect, change, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ELECT(0); + DEPOSE; + ASSERT_STATE(0, RAFT_FOLLOWER); + ASSERT_STATE(1, RAFT_FOLLOWER); + ASSERT_STATE(2, RAFT_FOLLOWER); + ELECT(1); + ASSERT_STATE(0, RAFT_FOLLOWER); + ASSERT_STATE(1, RAFT_LEADER); + ASSERT_STATE(2, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/* Trigger an election that re-elects the same node. */ +TEST(raft_fixture_elect, again, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ELECT(0); + DEPOSE; + ASSERT_STATE(0, RAFT_FOLLOWER); + ASSERT_STATE(1, RAFT_FOLLOWER); + ASSERT_STATE(2, RAFT_FOLLOWER); + ELECT(0); + ASSERT_STATE(0, RAFT_LEADER); + ASSERT_STATE(1, RAFT_FOLLOWER); + ASSERT_STATE(2, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/****************************************************************************** + * + * raft_fixture_step_until_applied + * + *****************************************************************************/ + +SUITE(raft_fixture_step_until_applied) + +/* Wait for one entry to be applied. */ +TEST(raft_fixture_step_until_applied, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply *req = munit_malloc(sizeof *req); + ELECT(0); + APPLY(0, req); + STEP_UNTIL_APPLIED(3); + ASSERT_FSM_X(0, 1); + ASSERT_FSM_X(1, 1); + ASSERT_FSM_X(2, 1); + free(req); + return MUNIT_OK; +} + +/* Wait for two entries to be applied. 
*/ +TEST(raft_fixture_step_until_applied, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply *req1 = munit_malloc(sizeof *req1); + struct raft_apply *req2 = munit_malloc(sizeof *req2); + ELECT(0); + APPLY(0, req1); + APPLY(0, req2); + STEP_UNTIL_APPLIED(4); + ASSERT_FSM_X(0, 2); + ASSERT_FSM_X(1, 2); + ASSERT_FSM_X(2, 2); + free(req1); + free(req2); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_heap.c b/test/raft/integration/test_heap.c new file mode 100644 index 000000000..a6265cbeb --- /dev/null +++ b/test/raft/integration/test_heap.c @@ -0,0 +1,53 @@ +#include "../../../src/raft.h" + +#include "../lib/runner.h" + +/****************************************************************************** + * + * Default heap functions + * + *****************************************************************************/ + +SUITE(raft_heap) + +TEST(raft_heap, malloc, NULL, NULL, 0, NULL) +{ + void *p; + p = raft_malloc(8); + munit_assert_ptr_not_null(p); + raft_free(p); + return MUNIT_OK; +} + +TEST(raft_heap, calloc, NULL, NULL, 0, NULL) +{ + void *p; + p = raft_calloc(1, 8); + munit_assert_ptr_not_null(p); + munit_assert_int(*(uint64_t *)p, ==, 0); + raft_free(p); + return MUNIT_OK; +} + +TEST(raft_heap, realloc, NULL, NULL, 0, NULL) +{ + void *p; + p = raft_realloc(NULL, 8); + munit_assert_ptr_not_null(p); + *(uint64_t *)p = 1; + p = raft_realloc(p, 16); + munit_assert_ptr_not_null(p); + munit_assert_int(*(uint64_t *)p, ==, 1); + raft_free(p); + return MUNIT_OK; +} + +TEST(raft_heap, aligned_alloc, NULL, NULL, 0, NULL) +{ + void *p; + p = raft_aligned_alloc(1024, 2048); + munit_assert_ptr_not_null(p); + munit_assert_int((uintptr_t)p % 1024, ==, 0); + raft_free(p); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_init.c b/test/raft/integration/test_init.c new file mode 100644 index 000000000..512864d2c --- /dev/null +++ b/test/raft/integration/test_init.c @@ -0,0 +1,85 @@ +#include "../../../src/raft.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * raft_init + * + *****************************************************************************/ + +SUITE(raft_init) + +/* Incompatible raft->io and raft->fsm wrt async snapshots. */ +TEST(raft_init, incompatIoFsmAsyncSnapshotNotNull, NULL, NULL, 0, NULL) +{ + /* Set incompatible io and fsm versions and non-NULL snapshot_async fn */ + struct raft r = {0}; + struct raft_io io = {0}; + struct raft_fsm fsm = {0}; + io.version = 1; /* Too low */ + io.async_work = (int (*)(struct raft_io *, struct raft_io_async_work *, + raft_io_async_work_cb))(uintptr_t)0xDEADBEEF; + fsm.version = 3; + fsm.snapshot_async = (int (*)(struct raft_fsm *, struct raft_buffer **, + unsigned int *))(uintptr_t)0xDEADBEEF; + + int rc; + rc = raft_init(&r, &io, &fsm, 1, "1"); + munit_assert_int(rc, ==, -1); + munit_assert_string_equal( + r.errmsg, + "async snapshot requires io->version > 1 and async_work method."); + return MUNIT_OK; +} + +/* Incompatible raft->io and raft->fsm wrt async snapshots. 
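+ *
+ * (Per the error message asserted below, the compatible combination pairs an
+ * io with version > 1 and a non-NULL async_work method with an fsm that sets
+ * snapshot_async. A sketch of the field assignments, with hypothetical
+ * implementation functions:
+ *
+ *     io.version = 2;
+ *     io.async_work = my_async_work;           // hypothetical
+ *     fsm.version = 3;
+ *     fsm.snapshot_async = my_snapshot_async;  // hypothetical
+ * )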
 */
+TEST(raft_init, incompatIoFsmAsyncSnapshotNull, NULL, NULL, 0, NULL)
+{
+    /* Set incompatible io and fsm versions and NULL snapshot_async fn */
+    struct raft r = {0};
+    struct raft_io io = {0};
+    struct raft_fsm fsm = {0};
+    io.version = 2;
+    io.async_work = NULL;
+    fsm.version = 3;
+    fsm.snapshot_async = (int (*)(struct raft_fsm *, struct raft_buffer **,
+                                  unsigned int *))(uintptr_t)0xDEADBEEF;
+
+    int rc;
+    rc = raft_init(&r, &io, &fsm, 1, "1");
+    munit_assert_int(rc, ==, -1);
+    munit_assert_string_equal(
+        r.errmsg,
+        "async snapshot requires io->version > 1 and async_work method.");
+    return MUNIT_OK;
+}
+
+TEST(raft_init, ioVersionNotSet, NULL, NULL, 0, NULL)
+{
+    struct raft r = {0};
+    struct raft_io io = {0};
+    struct raft_fsm fsm = {0};
+    io.version = 0;
+    fsm.version = 3;
+
+    int rc;
+    rc = raft_init(&r, &io, &fsm, 1, "1");
+    munit_assert_int(rc, ==, -1);
+    munit_assert_string_equal(r.errmsg, "io->version must be set");
+    return MUNIT_OK;
+}
+
+TEST(raft_init, fsmVersionNotSet, NULL, NULL, 0, NULL)
+{
+    struct raft r = {0};
+    struct raft_io io = {0};
+    struct raft_fsm fsm = {0};
+    io.version = 2;
+    fsm.version = 0;
+
+    int rc;
+    rc = raft_init(&r, &io, &fsm, 1, "1");
+    munit_assert_int(rc, ==, -1);
+    munit_assert_string_equal(r.errmsg, "fsm->version must be set");
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_membership.c b/test/raft/integration/test_membership.c
new file mode 100644
index 000000000..53d43aea9
--- /dev/null
+++ b/test/raft/integration/test_membership.c
@@ -0,0 +1,317 @@
+#include "../../../src/raft/configuration.h"
+#include "../../../src/raft/progress.h"
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_CLUSTER;
+    struct raft_change req;
+};
+
+/* Set up a cluster of 2 servers, with the first as leader. */
+static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(2);
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+    return f;
+}
+
+static void tear_down(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+/* Add an empty server to the cluster and start it. */
+#define GROW                                        \
+    {                                               \
+        int rv__;                                   \
+        CLUSTER_GROW;                               \
+        rv__ = raft_start(CLUSTER_RAFT(2));         \
+        munit_assert_int(rv__, ==, 0);              \
+    }
+
+/* Invoke raft_add against the I'th node and assert it returns the given
+ * value. */
+#define ADD(I, ID, RV)                                                \
+    {                                                                 \
+        int rv_;                                                      \
+        char address_[16];                                            \
+        sprintf(address_, "%d", ID);                                  \
+        rv_ = raft_add(CLUSTER_RAFT(I), &f->req, ID, address_, NULL); \
+        munit_assert_int(rv_, ==, RV);                                \
+    }
+
+/* Submit a request to assign the given ROLE to the server with the given
+ * ID. */
+#define ASSIGN(I, ID, ROLE)                                          \
+    {                                                                \
+        int _rv;                                                     \
+        _rv = raft_assign(CLUSTER_RAFT(I), &f->req, ID, ROLE, NULL); \
+        munit_assert_int(_rv, ==, 0);                                \
+    }
+
+/* Invoke raft_remove against the I'th node and assert it returns the given
+ * value.
*/ +#define REMOVE(I, ID, RV) \ + { \ + int rv_; \ + rv_ = raft_remove(CLUSTER_RAFT(I), &f->req, ID, NULL); \ + munit_assert_int(rv_, ==, RV); \ + } + +struct result +{ + int status; + bool done; +}; + +/* Submit an apply request. */ +#define APPLY_SUBMIT(I) \ + struct raft_buffer _buf; \ + struct raft_apply _req; \ + struct result _result = {0, false}; \ + int _rv; \ + FsmEncodeSetX(123, &_buf); \ + _req.data = &_result; \ + _rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, 1, NULL); \ + munit_assert_int(_rv, ==, 0); + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert the values of the committed and uncommitted configuration indexes on + * the raft instance with the given index. */ +#define ASSERT_CONFIGURATION_INDEXES(I, COMMITTED, UNCOMMITTED) \ + { \ + struct raft *raft_ = CLUSTER_RAFT(I); \ + munit_assert_int(raft_->configuration_committed_index, ==, COMMITTED); \ + munit_assert_int(raft_->configuration_uncommitted_index, ==, \ + UNCOMMITTED); \ + } + +/****************************************************************************** + * + * raft_add + * + *****************************************************************************/ + +SUITE(raft_add) + +/* After a request to add a new non-voting server is committed, the new + * configuration is not marked as uncommitted anymore */ +TEST(raft_add, committed, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + const struct raft_server *server; + ADD(0 /* I */, + 3 /* ID */, 0); + + /* The new configuration is already effective. */ + munit_assert_int(raft->configuration.n, ==, 3); + server = &raft->configuration.servers[2]; + munit_assert_int(server->id, ==, 3); + munit_assert_string_equal(server->address, "3"); + munit_assert_int(server->role, ==, RAFT_SPARE); + + /* The new configuration is marked as uncommitted. */ + ASSERT_CONFIGURATION_INDEXES(0, 1, 3); + + /* The next/match indexes now include an entry for the new server. */ + munit_assert_int(raft->leader_state.progress[2].next_index, ==, 4); + munit_assert_int(raft->leader_state.progress[2].match_index, ==, 0); + + CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000); + ASSERT_CONFIGURATION_INDEXES(0, 3, 0); + + /* The new configuration is marked as committed. */ + + return MUNIT_OK; +} + +/* Trying to add a server on a node which is not the leader results in an + * error. */ +TEST(raft_add, notLeader, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + ADD(1 /* I */, + 3 /* ID */, + RAFT_NOTLEADER); + return MUNIT_OK; +} + +/* Trying to add a server while a configuration change is already in progress + * results in an error. */ +TEST(raft_add, busy, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + ADD(0 /* I */, + 3 /* ID */, 0); + ADD(0 /* I */, + 4 /* ID */, + RAFT_CANTCHANGE); + munit_log(MUNIT_LOG_INFO, "done"); + return MUNIT_OK; +} + +/* Trying to add a server with an ID which is already in use results in an + * error. 
*/
+TEST(raft_add, duplicateId, setup, tear_down, 0, NULL)
+{
+    struct fixture *f = data;
+    ADD(0 /* I */,
+        2 /* ID */,
+        RAFT_DUPLICATEID);
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * raft_remove
+ *
+ *****************************************************************************/
+
+SUITE(raft_remove)
+
+/* After a request to remove a server is committed, the new configuration is
+ * not marked as uncommitted anymore */
+TEST(raft_remove, committed, setup, tear_down, 0, NULL)
+{
+    struct fixture *f = data;
+    GROW;
+    ADD(0, 3, 0);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000);
+    ASSIGN(0, 3, RAFT_STANDBY);
+    CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000);
+    CLUSTER_STEP_N(2);
+    REMOVE(0, 3, 0);
+    ASSERT_CONFIGURATION_INDEXES(0, 4, 5);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 5, 2000);
+    ASSERT_CONFIGURATION_INDEXES(0, 5, 0);
+    munit_assert_int(CLUSTER_RAFT(0)->configuration.n, ==, 2);
+    return MUNIT_OK;
+}
+
+/* A leader gets a request to remove itself. */
+TEST(raft_remove, self, setup, tear_down, 0, NULL)
+{
+    struct fixture *f = data;
+    REMOVE(0, 1, 0);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 2, 2000);
+    CLUSTER_STEP_UNTIL_APPLIED(1, 2, 10000);
+    return MUNIT_OK;
+}
+
+/* A leader gets a request to remove itself from a 3-node cluster */
+TEST(raft_remove, selfThreeNodeClusterReplicate, setup, tear_down, 0, NULL)
+{
+    struct fixture *f = data;
+    /* Add a third node */
+    GROW;
+    ADD(0, 3, 0);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000);
+    ASSIGN(0, 3, RAFT_VOTER);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 4, 2000);
+
+    /* Verify node with id 1 is the leader */
+    raft_id leader_id = 0xDEADBEEF;
+    const char *leader_address = NULL;
+    raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address);
+    munit_assert_ulong(leader_id, ==, 1);
+    munit_assert_ptr_not_null(leader_address);
+
+    /* The leader is requested to remove itself from the configuration */
+    REMOVE(0, 1, 0);
+
+    /* The removed leader should still replicate entries.
+     *
+     * Raft dissertation 4.2.2:
+     * `First, there will be a period of time (while it is committing Cnew)
+     * when a leader can manage a cluster that does not include itself; it
+     * replicates log entries but does not count itself in majorities.` */
+    APPLY_SUBMIT(0)
+
+    /* The removed leader eventually steps down */
+    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(5000);
+    raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address);
+    munit_assert_ulong(leader_id, ==, 0);
+    munit_assert_ptr_null(leader_address);
+
+    /* The original leader has applied the REMOVE entry */
+    CLUSTER_STEP_UNTIL_APPLIED(0, 5, 10000);
+
+    /* At this point the other nodes have replicated the new config, but have
+     * not yet applied it; they are missing a heartbeat from the leader
+     * informing them of the commit index of the new config. */
+
+    /* A new leader is elected */
+    CLUSTER_STEP_UNTIL_HAS_LEADER(5000);
+
+    /* The other nodes applied the barrier after
+     * the config change and therefore commit the new config.
*/ + CLUSTER_STEP_UNTIL_APPLIED(1, 6, 10000); + CLUSTER_STEP_UNTIL_APPLIED(2, 6, 10000); + + /* The removed leader doesn't know who the leader is */ + raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address); + munit_assert_ulong(leader_id, ==, 0); + munit_assert_ptr_null(leader_address); + + /* The new configuration has a leader */ + raft_leader(CLUSTER_RAFT(1), &leader_id, &leader_address); + munit_assert_ulong(leader_id, !=, 0); + munit_assert_ulong(leader_id, !=, 1); + munit_assert_ptr_not_null(leader_address); + return MUNIT_OK; +} + +/* Trying to remove a server on a node which is not the leader results in an + * error. */ +TEST(raft_remove, notLeader, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + REMOVE(1 /* I */, + 3 /* ID */, + RAFT_NOTLEADER); + return MUNIT_OK; +} + +/* Trying to remove a server while a configuration change is already in progress + * results in an error. */ +TEST(raft_remove, inProgress, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + ADD(0, 3, 0); + REMOVE(0, 3, RAFT_CANTCHANGE); + return MUNIT_OK; +} + +/* Trying to remove a server with an unknown ID results in an error. */ +TEST(raft_remove, badId, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + REMOVE(0, 3, RAFT_BADID); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_recover.c b/test/raft/integration/test_recover.c new file mode 100644 index 000000000..26f036857 --- /dev/null +++ b/test/raft/integration/test_recover.c @@ -0,0 +1,56 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture holding a bootstrapped raft cluster. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(3); + CLUSTER_BOOTSTRAP; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Recover tests. + * + *****************************************************************************/ + +SUITE(raft_recover) + +/* Attempting to recover a running instance results in RAFT_BUSY. */ +TEST(raft_recover, busy, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + struct raft_configuration configuration; + int rv; + + /* Start all servers. 
*/
+    CLUSTER_START;
+
+    raft = CLUSTER_RAFT(0);
+    CLUSTER_CONFIGURATION(&configuration);
+    rv = raft_recover(raft, &configuration);
+    munit_assert_int(rv, ==, RAFT_BUSY);
+    raft_configuration_close(&configuration);
+
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_replication.c b/test/raft/integration/test_replication.c
new file mode 100644
index 000000000..c971a78a7
--- /dev/null
+++ b/test/raft/integration/test_replication.c
@@ -0,0 +1,1280 @@
+#include "../../../src/raft/configuration.h"
+#include "../../../src/raft/flags.h"
+#include "../../../src/raft/progress.h"
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_CLUSTER;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+/* Standard startup sequence, bootstrapping the cluster and electing server 0 */
+#define BOOTSTRAP_START_AND_ELECT \
+    CLUSTER_BOOTSTRAP;            \
+    CLUSTER_START;                \
+    CLUSTER_ELECT(0);             \
+    ASSERT_TIME(1045)
+
+/******************************************************************************
+ *
+ * Set up a cluster with two servers.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(2);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Assertions
+ *
+ *****************************************************************************/
+
+/* Assert that the I'th server is in follower state. */
+#define ASSERT_FOLLOWER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_FOLLOWER)
+
+/* Assert that the I'th server is in candidate state. */
+#define ASSERT_CANDIDATE(I) \
+    munit_assert_int(CLUSTER_STATE(I), ==, RAFT_CANDIDATE)
+
+/* Assert that the I'th server is in leader state. */
+#define ASSERT_LEADER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_LEADER)
+
+/* Assert that the fixture time matches the given value */
+#define ASSERT_TIME(TIME) munit_assert_int(CLUSTER_TIME, ==, TIME)
+
+/* Assert that the configuration of the I'th server matches the given one */
+#define ASSERT_CONFIGURATION(I, EXPECTED)                                    \
+    do {                                                                     \
+        struct raft *_raft = CLUSTER_RAFT(I);                                \
+        struct raft_configuration *_actual = &_raft->configuration;          \
+        unsigned _i;                                                         \
+                                                                             \
+        munit_assert_uint(_actual->n, ==, (EXPECTED)->n);                    \
+        for (_i = 0; _i < _actual->n; _i++) {                                \
+            struct raft_server *_server1 = &_actual->servers[_i];            \
+            struct raft_server *_server2 = &(EXPECTED)->servers[_i];         \
+            munit_assert_ulong(_server1->id, ==, _server2->id);              \
+            munit_assert_int(_server1->role, ==, _server2->role);            \
+            munit_assert_string_equal(_server1->address, _server2->address); \
+        }                                                                    \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Log replication.
+ *
+ *****************************************************************************/
+
+SUITE(replication)
+
+/* A leader sends a heartbeat message as soon as it gets elected.
*/ +TEST(replication, sendInitialHeartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes candidate and sends vote requests after the election + * timeout. */ + CLUSTER_STEP_N(19); + ASSERT_TIME(1000); + ASSERT_CANDIDATE(0); + + /* Server 0 receives the vote result, becomes leader and sends + * heartbeats. */ + CLUSTER_STEP_N(6); + ASSERT_LEADER(0); + ASSERT_TIME(1030); + raft = CLUSTER_RAFT(0); + munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1030); + + /* Server 1 receives the heartbeat from server 0 and resets its election + * timer. */ + raft = CLUSTER_RAFT(1); + munit_assert_int(raft->election_timer_start, ==, 1015); + CLUSTER_STEP_N(2); + munit_assert_int(raft->election_timer_start, ==, 1045); + + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); + + return MUNIT_OK; +} + +/* After receiving an AppendEntriesResult, a leader has set the feature flags of + * a node. */ +TEST(replication, receiveFlags, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes leader and sends the initial heartbeat. */ + CLUSTER_STEP_N(24); + ASSERT_LEADER(0); + ASSERT_TIME(1030); + + /* Flags is empty */ + raft = CLUSTER_RAFT(0); + munit_assert_ullong(raft->leader_state.progress[1].features, ==, 0); + + raft = CLUSTER_RAFT(1); + /* Server 1 receives the first heartbeat. */ + CLUSTER_STEP_N(4); + munit_assert_int(raft->election_timer_start, ==, 1045); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* Server 0 receives the reply to the heartbeat. */ + CLUSTER_STEP_N(2); + munit_assert_int(CLUSTER_N_RECV(0, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 1); + raft = CLUSTER_RAFT(0); + munit_assert_ullong(raft->leader_state.progress[1].features, ==, + RAFT_DEFAULT_FEATURE_FLAGS); + + return MUNIT_OK; +} + +/* A leader keeps sending heartbeat messages at regular intervals to + * maintain leadership. */ +TEST(replication, sendFollowupHeartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes leader and sends the initial heartbeat. */ + CLUSTER_STEP_N(24); + ASSERT_LEADER(0); + ASSERT_TIME(1030); + + raft = CLUSTER_RAFT(1); + + /* Server 1 receives the first heartbeat. */ + CLUSTER_STEP_N(4); + munit_assert_int(raft->election_timer_start, ==, 1045); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* Server 1 receives the second heartbeat. */ + CLUSTER_STEP_N(8); + munit_assert_int(raft->election_timer_start, ==, 1215); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 2); + + /* Server 1 receives the third heartbeat. */ + CLUSTER_STEP_N(7); + munit_assert_int(raft->election_timer_start, ==, 1315); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 3); + + /* Server 1 receives the fourth heartbeat. 
*/ + CLUSTER_STEP_N(7); + munit_assert_int(raft->election_timer_start, ==, 1415); + + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4); + munit_assert_int(CLUSTER_N_RECV(0, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 4); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 4); + munit_assert_int(CLUSTER_N_SEND(1, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 4); + + return MUNIT_OK; +} + +/* If a leader replicates some entries during a given heartbeat interval, it + * skips sending the heartbeat for that interval. */ +TEST(replication, sendSkipHeartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + struct raft_apply req; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + raft = CLUSTER_RAFT(0); + + /* Server 0 becomes leader and sends the first two heartbeats. */ + CLUSTER_STEP_UNTIL_ELAPSED(1215); + ASSERT_LEADER(0); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 2); + + /* Server 0 starts replicating a new entry after 15 milliseconds. */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + ASSERT_TIME(1230); + CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); + CLUSTER_STEP_N(1); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); + munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1230); + + /* When the heartbeat timeout expires, server 0 does not send an empty + * append entries. */ + CLUSTER_STEP_UNTIL_ELAPSED(70); + ASSERT_TIME(1300); + CLUSTER_STEP_N(1); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); + munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1230); + + return MUNIT_OK; +} + +/* The leader doesn't send replication messages to idle servers. */ +TEST(replication, skipIdle, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_change req1; + struct raft_apply req2; + BOOTSTRAP_START_AND_ELECT; + CLUSTER_ADD(&req1); + CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000); + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req2, 1, NULL); + CLUSTER_STEP_UNTIL_ELAPSED(1000); + munit_assert_int(CLUSTER_LAST_APPLIED(0), ==, 4); + munit_assert_int(CLUSTER_LAST_APPLIED(1), ==, 4); + munit_assert_int(CLUSTER_LAST_APPLIED(2), ==, 0); + return MUNIT_OK; +} + +/* A follower remains in probe mode until the leader receives a successful + * AppendEntries response. */ +TEST(replication, sendProbe, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply req1; + struct raft_apply req2; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes leader and sends the initial heartbeat. */ + CLUSTER_STEP_N(25); + ASSERT_LEADER(0); + ASSERT_TIME(1030); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* Set a very high network latency for server 1, so server 0 will send a + * second probe AppendEntries without transitioning to pipeline mode. */ + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 0); + CLUSTER_SET_NETWORK_LATENCY(1, 250); + + /* Server 0 receives a new entry after 15 milliseconds. Since the follower + * is still in probe mode and since an AppendEntries message was already + * sent recently, it does not send the new entry immediately. */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* A heartbeat timeout elapses without receiving a response, so server 0 + * sends an new AppendEntries to server 1. 
*/ + CLUSTER_STEP_UNTIL_ELAPSED(85); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); + + /* Server 0 receives a second entry after 15 milliseconds. Since the + * follower is still in probe mode and since an AppendEntries message was + * already sent recently, it does not send the new entry immediately. */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); + + /* Eventually server 0 receives AppendEntries results for both entries. */ + CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); + + return MUNIT_OK; +} + +static bool indices_updated(struct raft_fixture *f, void *data) +{ + (void)f; + const struct raft *r = data; + return r->last_stored == 4 && r->leader_state.progress[1].match_index == 3; +} + +/* A follower transitions to pipeline mode after the leader receives a + * successful AppendEntries response from it. */ +TEST(replication, sendPipeline, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + struct raft_apply req1; + struct raft_apply req2; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + raft = CLUSTER_RAFT(0); + + /* Server 0 becomes leader and sends the initial heartbeat, receiving a + * successful response. */ + CLUSTER_STEP_UNTIL_ELAPSED(1070); + ASSERT_LEADER(0); + ASSERT_TIME(1070); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* Server 0 receives a new entry after 15 milliseconds. Since the follower + * has transitioned to pipeline mode the new entry is sent immediately and + * the next index is optimistically increased. */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); + munit_assert_int(raft->leader_state.progress[1].next_index, ==, 4); + + /* After another 15 milliseconds server 0 receives a second apply request, + * which is also sent out immediately */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); + munit_assert_int(raft->leader_state.progress[1].next_index, ==, 5); + + /* Wait until the leader has stored entry 4 and the follower has matched + * entry 3. Expect the commit index to have been updated to 3. */ + CLUSTER_STEP_UNTIL(indices_updated, CLUSTER_RAFT(0), 2000); + munit_assert_ulong(raft->commit_index, ==, 3); + + /* Eventually server 0 receives AppendEntries results for both entries. */ + CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); + + return MUNIT_OK; +} + +/* A follower disconnects while in probe mode. */ +TEST(replication, sendDisconnect, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes leader and sends the initial heartbeat, however they + * fail because server 1 has disconnected. */ + CLUSTER_STEP_N(24); + ASSERT_LEADER(0); + CLUSTER_DISCONNECT(0, 1); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 0); + + /* After the heartbeat timeout server 0 retries, but still fails. */ + CLUSTER_STEP_UNTIL_ELAPSED(100); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 0); + + /* After another heartbeat timeout server 0 retries and this time + * succeeds. 
*/
+    CLUSTER_STEP_UNTIL_ELAPSED(100);
+    CLUSTER_RECONNECT(0, 1);
+    CLUSTER_STEP;
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1);
+
+    return MUNIT_OK;
+}
+
+/* A follower disconnects while in pipeline mode. */
+TEST(replication, sendDisconnectPipeline, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply req1;
+    struct raft_apply req2;
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+
+    /* Server 0 becomes leader and sends a couple of heartbeats. */
+    CLUSTER_STEP_UNTIL_ELAPSED(1215);
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2);
+
+    /* It then starts to replicate a few entries, but the follower disconnects
+     * before delivering results. */
+    CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL);
+    CLUSTER_STEP;
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3);
+    CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL);
+    CLUSTER_STEP;
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4);
+
+    CLUSTER_DISCONNECT(0, 1);
+
+    /* The next heartbeat fails, transitioning the follower back to probe
+     * mode. */
+    CLUSTER_STEP_UNTIL_ELAPSED(115);
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4);
+
+    /* After reconnection the follower eventually replicates the entries and
+     * reports back. */
+    CLUSTER_RECONNECT(0, 1);
+
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000);
+
+    return MUNIT_OK;
+}
+
+static char *send_oom_heap_fault_delay[] = {"5", NULL};
+static char *send_oom_heap_fault_repeat[] = {"1", NULL};
+
+static MunitParameterEnum send_oom_params[] = {
+    {TEST_HEAP_FAULT_DELAY, send_oom_heap_fault_delay},
+    {TEST_HEAP_FAULT_REPEAT, send_oom_heap_fault_repeat},
+    {NULL, NULL},
+};
+
+/* Out of memory failures. */
+TEST(replication, sendOom, setUp, tearDown, 0, send_oom_params)
+{
+    struct fixture *f = data;
+    return MUNIT_SKIP;
+    struct raft_apply req;
+    BOOTSTRAP_START_AND_ELECT;
+
+    HEAP_FAULT_ENABLE;
+
+    CLUSTER_APPLY_ADD_X(0, &req, 1, NULL);
+    CLUSTER_STEP;
+
+    return MUNIT_OK;
+}
+
+/* A failure occurs upon submitting the I/O request. */
+TEST(replication, persistError, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply req;
+    BOOTSTRAP_START_AND_ELECT;
+
+    raft_fixture_append_fault(&f->cluster, 0, 0);
+
+    CLUSTER_APPLY_ADD_X(0, &req, 1, NULL);
+    CLUSTER_STEP;
+
+    return MUNIT_OK;
+}
+
+/* Receive the same entry a second time, before the first has been persisted. */
+TEST(replication, recvTwice, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply *req = munit_malloc(sizeof *req);
+    BOOTSTRAP_START_AND_ELECT;
+
+    CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, NULL);
+
+    /* Set a high disk latency for server 1, so server 0 won't receive an
+     * AppendEntries result within the heartbeat and will re-send the same
+     * entries */
+    CLUSTER_SET_DISK_LATENCY(1, 300);
+
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); /* First AppendEntries */
+    CLUSTER_STEP_UNTIL_ELAPSED(110);         /* Heartbeat timeout */
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); /* Second AppendEntries */
+
+    CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 500);
+
+    free(req);
+
+    return MUNIT_OK;
+}
+
+/* If the term in the request is stale, the server rejects it. */
+TEST(replication, recvStaleTerm, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+    BOOTSTRAP_START_AND_ELECT;
+
+    /* Set a very high election timeout and then disconnect the leader so it
+     * will keep sending heartbeats.
*/
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 5000);
+    raft_set_election_timeout(CLUSTER_RAFT(0), 5000);
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+    CLUSTER_SATURATE_BOTHWAYS(0, 2);
+
+    /* Eventually a new leader gets elected. */
+    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(5000);
+    CLUSTER_STEP_UNTIL_HAS_LEADER(10000);
+    munit_assert_int(CLUSTER_LEADER, ==, 1);
+
+    /* Reconnect the old leader to the current follower. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 2);
+
+    /* Step a few times, so the old leader sends heartbeats to the follower,
+     * which rejects them. */
+    CLUSTER_STEP_UNTIL_ELAPSED(200);
+
+    return MUNIT_OK;
+}
+
+/* If the server's log is shorter than prevLogIndex, the request is rejected. */
+TEST(replication, recvMissingEntries, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    CLUSTER_BOOTSTRAP;
+
+    /* Server 0 has an entry that server 1 doesn't have */
+    entry.type = RAFT_COMMAND;
+    entry.term = 1;
+    FsmEncodeSetX(1, &entry.buf);
+    CLUSTER_ADD_ENTRY(0, &entry);
+
+    /* Server 0 wins the election because it has a longer log. */
+    CLUSTER_START;
+    CLUSTER_STEP_UNTIL_HAS_LEADER(5000);
+    munit_assert_int(CLUSTER_LEADER, ==, 0);
+
+    /* The first server replicates missing entries to the second. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000);
+
+    return MUNIT_OK;
+}
+
+/* If the term of the last log entry on the server is different from
+ * prevLogTerm, and the value of prevLogIndex is greater than the server's
+ * commit index (i.e. this is a normal inconsistency), we reject the request. */
+TEST(replication, recvPrevLogTermMismatch, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    CLUSTER_BOOTSTRAP;
+
+    /* The servers have an entry with a conflicting term. */
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 2;
+    FsmEncodeSetX(1, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_COMMAND;
+    entry2.term = 1;
+    FsmEncodeSetX(2, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+
+    /* The follower eventually replicates the entry */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 2, 3000);
+
+    return MUNIT_OK;
+}
+
+/* The follower has an uncommitted log entry that conflicts with a new one sent
+ * by the leader (same index but different term). The follower's conflicting log
+ * entry happens to be a configuration change. In that case the follower
+ * discards the conflicting entry from its log and rolls back its configuration
+ * to the initial one contained in the log entry at index 1. */
+TEST(replication, recvRollbackConfigurationToInitial, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    struct raft_configuration base; /* Committed configuration at index 1 */
+    struct raft_configuration conf; /* Uncommitted configuration at index 2 */
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_CONFIGURATION(&base);
+
+    /* Both servers have an entry at index 2, but with conflicting terms. The
+     * entry of the second server is a configuration change. */
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 2;
+    FsmEncodeSetX(1, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_CHANGE;
+    entry2.term = 1;
+    CLUSTER_CONFIGURATION(&conf);
+    raft_configuration_add(&conf, 3, "3", 2);
+    raft_configuration_encode(&conf, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    /* At startup the second server uses the most recent configuration, i.e.
+     * the one contained in the entry that we just added.
+     * The server can't know yet if it's committed or not, and regards it as
+     * a pending configuration change. */
+    CLUSTER_START;
+    ASSERT_CONFIGURATION(1, &conf);
+
+    /* The first server gets elected. */
+    CLUSTER_ELECT(0);
+
+    /* The second server eventually replicates the first server's log entry at
+     * index 2, truncating its own log and rolling back to the configuration
+     * contained in the log entry at index 1. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 2, 3000);
+    ASSERT_CONFIGURATION(0, &base);
+    ASSERT_CONFIGURATION(1, &base);
+
+    raft_configuration_close(&base);
+    raft_configuration_close(&conf);
+
+    return MUNIT_OK;
+}
+
+/* The follower has an uncommitted log entry that conflicts with a new one sent
+ * by the leader (same index but different term). The follower's conflicting log
+ * entry happens to be a configuration change. There's also an older committed
+ * configuration entry present. In that case the follower discards the
+ * conflicting entry from its log and rolls back its configuration to the
+ * committed one in the older configuration entry. */
+TEST(replication, recvRollbackConfigurationToPrevious, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    struct raft_entry entry3;
+    struct raft_entry entry4;
+    struct raft_configuration base; /* Committed configuration at index 2 */
+    struct raft_configuration conf; /* Uncommitted configuration at index 3 */
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_CONFIGURATION(&base);
+
+    /* Both servers have a matching configuration entry at index 2. */
+    CLUSTER_CONFIGURATION(&conf);
+
+    entry1.type = RAFT_CHANGE;
+    entry1.term = 1;
+    raft_configuration_encode(&conf, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_CHANGE;
+    entry2.term = 1;
+    raft_configuration_encode(&conf, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    /* Both servers have an entry at index 3, but with conflicting terms. The
+     * entry of the second server is a configuration change. */
+    entry3.type = RAFT_COMMAND;
+    entry3.term = 2;
+    FsmEncodeSetX(1, &entry3.buf);
+    CLUSTER_ADD_ENTRY(0, &entry3);
+
+    entry4.type = RAFT_CHANGE;
+    entry4.term = 1;
+    raft_configuration_add(&conf, 3, "3", 2);
+    raft_configuration_encode(&conf, &entry4.buf);
+    CLUSTER_ADD_ENTRY(1, &entry4);
+
+    /* At startup the second server uses the most recent configuration, i.e.
+     * the one contained in the log entry at index 3. The server can't know
+     * yet if it's committed or not, and regards it as a pending configuration
+     * change. */
+    CLUSTER_START;
+    ASSERT_CONFIGURATION(1, &conf);
+
+    /* The first server gets elected. */
+    CLUSTER_ELECT(0);
+
+    /* The second server eventually replicates the first server's log entry at
+     * index 3, truncating its own log and rolling back to the configuration
+     * contained in the log entry at index 2. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000);
+    ASSERT_CONFIGURATION(0, &base);
+    ASSERT_CONFIGURATION(1, &base);
+
+    raft_configuration_close(&base);
+    raft_configuration_close(&conf);
+
+    return MUNIT_OK;
+}
+
+/* The follower has an uncommitted log entry that conflicts with a new one sent
+ * by the leader (same index but different term). The follower's conflicting log
+ * entry happens to be a configuration change. The follower's log has been
+ * truncated after a snapshot and does not contain the previous committed
+ * configuration anymore.
+ * In that case the follower discards the conflicting entry from its log and
+ * rolls back its configuration to the previous committed one, which was
+ * cached when the snapshot was restored. */
+TEST(replication, recvRollbackConfigurationToSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    struct raft_configuration base; /* Committed configuration at index 1 */
+    struct raft_configuration conf; /* Uncommitted configuration at index 2 */
+    int rv;
+
+    CLUSTER_CONFIGURATION(&conf);
+    CLUSTER_CONFIGURATION(&base);
+
+    /* Bootstrap the first server. This creates a log entry at index 1
+     * containing the initial configuration. */
+    rv = raft_bootstrap(CLUSTER_RAFT(0), &conf);
+    munit_assert_int(rv, ==, 0);
+
+    /* The second server has a snapshot up to entry 1. Entry 1 is not present
+     * in the log. */
+    CLUSTER_SET_SNAPSHOT(1 /*            */,
+                         1 /* last index */,
+                         1 /* last term  */,
+                         1 /* conf index */,
+                         5 /* x          */,
+                         0 /* y          */);
+    CLUSTER_SET_TERM(1, 1);
+
+    /* Both servers have an entry at index 2, but with conflicting terms. The
+     * entry of the second server is a configuration change and gets appended
+     * to the truncated log. */
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 3;
+    FsmEncodeSetX(1, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_CHANGE;
+    entry2.term = 2;
+    raft_configuration_add(&conf, 3, "3", 2);
+    raft_configuration_encode(&conf, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    /* At startup the second server uses the most recent configuration, i.e.
+     * the one contained in the log entry at index 2. The server can't know
+     * yet if it's committed or not, and regards it as a pending configuration
+     * change. */
+    CLUSTER_START;
+    ASSERT_CONFIGURATION(1, &conf);
+
+    CLUSTER_ELECT(0);
+
+    /* The second server eventually replicates the first server's log entry at
+     * index 3, truncating its own log and rolling back to the configuration
+     * contained in the snapshot, which is not present in the log anymore but
+     * was cached at startup. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000);
+    ASSERT_CONFIGURATION(0, &base);
+    ASSERT_CONFIGURATION(1, &base);
+
+    raft_configuration_close(&base);
+    raft_configuration_close(&conf);
+
+    return MUNIT_OK;
+}
+
+/* If any of the new entries has the same index as an existing entry in our
+ * log, but a different term, and that entry index is already committed, we
+ * bail out with an error. */
+TEST(replication, recvPrevIndexConflict, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    CLUSTER_BOOTSTRAP;
+
+    /* The servers have an entry with a conflicting term. */
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 2;
+    FsmEncodeSetX(1, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_COMMAND;
+    entry2.term = 1;
+    FsmEncodeSetX(2, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+
+    /* Artificially bump the commit index on the second server */
+    CLUSTER_RAFT(1)->commit_index = 2;
+    CLUSTER_STEP;
+    CLUSTER_STEP;
+
+    return MUNIT_OK;
+}
+
+/* A write log request is submitted for outstanding log entries. If some
+ * entries already exist in the log, they will be skipped.
*/
+TEST(replication, recvSkip, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply *req = munit_malloc(sizeof *req);
+    BOOTSTRAP_START_AND_ELECT;
+
+    /* Submit an entry */
+    CLUSTER_APPLY_ADD_X(0, req, 1, NULL);
+
+    /* The leader replicates the entry to the follower, but it does not get
+     * notified about the result, so it sends the entry again. */
+    CLUSTER_STEP;
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+    CLUSTER_STEP_UNTIL_ELAPSED(150);
+
+    /* The follower reconnects and receives the same entry again. This time the
+     * leader receives the notification. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 1);
+    CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 2000);
+
+    free(req);
+
+    return MUNIT_OK;
+}
+
+/* If the index and term of the last snapshot on the server match prevLogIndex
+ * and prevLogTerm the request is accepted. */
+TEST(replication, recvMatch_last_snapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    struct raft_configuration configuration;
+    int rv;
+
+    CLUSTER_CONFIGURATION(&configuration);
+    rv = raft_bootstrap(CLUSTER_RAFT(0), &configuration);
+    munit_assert_int(rv, ==, 0);
+    raft_configuration_close(&configuration);
+
+    /* The first server has entry 2 */
+    entry.type = RAFT_COMMAND;
+    entry.term = 2;
+    FsmEncodeSetX(5, &entry.buf);
+    CLUSTER_ADD_ENTRY(0, &entry);
+
+    /* The second server has a snapshot up to entry 2 */
+    CLUSTER_SET_SNAPSHOT(1 /*            */,
+                         2 /* last index */,
+                         2 /* last term  */,
+                         1 /* conf index */,
+                         5 /* x          */,
+                         0 /* y          */);
+    CLUSTER_SET_TERM(1, 2);
+
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+
+    /* Apply an additional entry and check that it gets replicated on the
+     * follower. */
+    CLUSTER_MAKE_PROGRESS;
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000);
+
+    return MUNIT_OK;
+}
+
+/* If a candidate server receives a request containing the same term as its
+ * own, it steps down to follower and accepts the request. */
+TEST(replication, recvCandidateSameTerm, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+    CLUSTER_BOOTSTRAP;
+
+    /* Disconnect server 2 from the other two and set a low election timeout on
+     * it, so it will immediately start an election. */
+    CLUSTER_SATURATE_BOTHWAYS(2, 0);
+    CLUSTER_SATURATE_BOTHWAYS(2, 1);
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 2, 800);
+    raft_set_election_timeout(CLUSTER_RAFT(2), 800);
+
+    /* Server 2 becomes candidate. */
+    CLUSTER_START;
+    CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_CANDIDATE, 1000);
+    munit_assert_int(CLUSTER_TERM(2), ==, 2);
+
+    /* Server 0 wins the election and replicates an entry. */
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 2000);
+    munit_assert_int(CLUSTER_TERM(0), ==, 2);
+    munit_assert_int(CLUSTER_TERM(1), ==, 2);
+    munit_assert_int(CLUSTER_TERM(2), ==, 2);
+    CLUSTER_MAKE_PROGRESS;
+
+    /* Now reconnect the third server, which eventually steps down and
+     * replicates the entry. */
+    munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE);
+    munit_assert_int(CLUSTER_TERM(2), ==, 2);
+    CLUSTER_DESATURATE_BOTHWAYS(2, 0);
+    CLUSTER_DESATURATE_BOTHWAYS(2, 1);
+    CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_FOLLOWER, 2000);
+    CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000);
+
+    return MUNIT_OK;
+}
+
+/* If a candidate server receives a request containing a higher term than its
+ * own, it steps down to follower and accepts the request.
*/ +TEST(replication, recvCandidateHigherTerm, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + CLUSTER_BOOTSTRAP; + + /* Set a high election timeout on server 1, so it won't become candidate */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 2000); + raft_set_election_timeout(CLUSTER_RAFT(1), 2000); + + /* Disconnect server 2 from the other two. */ + CLUSTER_SATURATE_BOTHWAYS(2, 0); + CLUSTER_SATURATE_BOTHWAYS(2, 1); + + /* Set a low election timeout on server 0, and disconnect it from server 1, + * so by the time it wins the second round, server 2 will have turned + * candidate */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 800); + raft_set_election_timeout(CLUSTER_RAFT(0), 800); + CLUSTER_SATURATE_BOTHWAYS(0, 1); + + CLUSTER_START; + + /* Server 2 becomes candidate, and server 0 already is candidate. */ + CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_CANDIDATE, 1500); + munit_assert_int(CLUSTER_TERM(2), ==, 2); + munit_assert_int(CLUSTER_STATE(0), ==, RAFT_CANDIDATE); + munit_assert_int(CLUSTER_TERM(0), ==, 2); + + /* Server 0 starts a new election, while server 2 is still candidate */ + CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000); + munit_assert_int(CLUSTER_TERM(2), ==, 2); + munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE); + + /* Reconnect the first and second server and let the election succeed and + * replicate an entry. */ + CLUSTER_DESATURATE_BOTHWAYS(0, 1); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + CLUSTER_MAKE_PROGRESS; + + /* Now reconnect the third server, which eventually steps down and + * replicates the entry. */ + munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE); + munit_assert_int(CLUSTER_TERM(2), ==, 2); + CLUSTER_DESATURATE_BOTHWAYS(2, 0); + CLUSTER_DESATURATE_BOTHWAYS(2, 1); + CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_FOLLOWER, 2000); + CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000); + + return MUNIT_OK; +} + +/* If the server handling the response is not the leader, the result + * is ignored. */ +TEST(replication, resultNotLeader, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + BOOTSTRAP_START_AND_ELECT; + + /* Set a very high-latency for the second server's outgoing messages, so the + * first server won't get notified about the results for a while. */ + CLUSTER_SET_NETWORK_LATENCY(1, 400); + + /* Set a low election timeout on the first server so it will step down very + * soon. */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 200); + raft_set_election_timeout(CLUSTER_RAFT(0), 200); + + /* Eventually leader steps down and becomes candidate. */ + CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE, 2000); + + /* The AppendEntries result eventually gets delivered, but the candidate + * ignores it. */ + CLUSTER_STEP_UNTIL_ELAPSED(400); + + return MUNIT_OK; +} + +/* If the response has a term which is lower than the server's one, it's + * ignored. */ +TEST(replication, resultLowerTerm, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + BOOTSTRAP_START_AND_ELECT; + + /* Set a very high-latency for the second server's outgoing messages, so the + * first server won't get notified about the results for a while. */ + CLUSTER_SET_NETWORK_LATENCY(1, 2000); + + /* Set a high election timeout on server 1, so it won't become candidate */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 2000); + raft_set_election_timeout(CLUSTER_RAFT(1), 2000); + + /* Disconnect server 0 and set a low election timeout on it so it will step + * down very soon. 
*/
+    CLUSTER_SATURATE_BOTHWAYS(0, 2);
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 200);
+    raft_set_election_timeout(CLUSTER_RAFT(0), 200);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000);
+
+    /* Make server 0 become leader again. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 2);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 4000);
+
+    /* Eventually deliver the result message. */
+    CLUSTER_STEP_UNTIL_ELAPSED(2500);
+
+    return MUNIT_OK;
+}
+
+/* If the response has a term which is higher than the server's one, step down
+ * to follower. */
+TEST(replication, resultHigherTerm, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+    BOOTSTRAP_START_AND_ELECT;
+
+    /* Set a very high election timeout for server 0 so it won't step down. */
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 5000);
+    raft_set_election_timeout(CLUSTER_RAFT(0), 5000);
+
+    /* Disconnect server 0 from the rest of the cluster. */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+    CLUSTER_SATURATE_BOTHWAYS(0, 2);
+
+    /* Eventually a new leader gets elected */
+    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(2000);
+    CLUSTER_STEP_UNTIL_HAS_LEADER(4000);
+    munit_assert_int(CLUSTER_LEADER, ==, 1);
+
+    /* Reconnect the old leader to the current follower, which eventually
+     * replies with an AppendEntries result containing a higher term. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 2);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000);
+
+    return MUNIT_OK;
+}
+
+/* If the response fails because of a log mismatch, the nextIndex for the
+ * server is updated and the relevant older entries are resent. */
+TEST(replication, resultRetry, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    CLUSTER_BOOTSTRAP;
+
+    /* Add an additional entry to the first server that the second server does
+     * not have. */
+    entry.type = RAFT_COMMAND;
+    entry.term = 1;
+    FsmEncodeSetX(5, &entry.buf);
+    CLUSTER_ADD_ENTRY(0, &entry);
+
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+
+    /* The first server receives an AppendEntries result from the second server
+     * indicating that its log does not have the entry at index 2, so it will
+     * resend it. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 2000);
+
+    return MUNIT_OK;
+}
+
+static void applyAssertStatusCb(struct raft_apply *req,
+                                int status,
+                                void *result)
+{
+    (void)result;
+    int status_expected = (int)(intptr_t)(req->data);
+    munit_assert_int(status_expected, ==, status);
+}
+
+/* When the leader fails to write some new entries to disk, it steps down. */
+TEST(replication, diskWriteFailure, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply *req = munit_malloc(sizeof(*req));
+    req->data = (void *)(intptr_t)RAFT_IOERR;
+    BOOTSTRAP_START_AND_ELECT;
+
+    raft_fixture_append_fault(&f->cluster, 0, 0);
+    CLUSTER_APPLY_ADD_X(0, req, 1, applyAssertStatusCb);
+    /* The leader steps down when its disk write fails. */
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000);
+    free(req);
+
+    return MUNIT_OK;
+}
+
+/* A follower updates its term number while persisting entries.
*/
+TEST(replication, newTermWhileAppending, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply *req = munit_malloc(sizeof(*req));
+    raft_term term;
+    CLUSTER_GROW;
+
+    /* Make sure that persisting entries will take a long time */
+    CLUSTER_SET_DISK_LATENCY(2, 3000);
+
+    BOOTSTRAP_START_AND_ELECT;
+    CLUSTER_APPLY_ADD_X(0, req, 1, NULL);
+
+    /* Wait for the leader to replicate the entry */
+    CLUSTER_STEP_UNTIL_ELAPSED(500);
+
+    /* Force a new term */
+    term = CLUSTER_RAFT(2)->current_term;
+    CLUSTER_DEPOSE;
+    CLUSTER_ELECT(1);
+
+    CLUSTER_STEP_UNTIL_ELAPSED(500);
+    munit_assert_ullong(CLUSTER_RAFT(2)->current_term, ==, term + 1);
+
+    /* Wait for the long disk write to complete */
+    CLUSTER_STEP_UNTIL_ELAPSED(3000);
+
+    free(req);
+
+    return MUNIT_OK;
+}
+
+/* A leader with a slow disk commits an entry that it hasn't persisted yet,
+ * because enough followers to form a majority have acknowledged that they
+ * have appended the entry. The leader's last_stored field hence lags behind
+ * its commit_index. A new leader gets elected, with a higher commit index,
+ * and sends first a new entry and then a heartbeat to the old leader, which
+ * needs to update its commit_index taking into account its lagging
+ * last_stored. */
+TEST(replication, lastStoredLaggingBehindCommitIndex, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+
+    /* Server 0 takes a long time to persist entry 2 (the barrier) */
+    CLUSTER_SET_DISK_LATENCY(0, 10000);
+
+    /* Server 0 gets elected and creates a barrier entry at index 2 */
+    BOOTSTRAP_START_AND_ELECT;
+
+    /* Server 0 commits and applies barrier entry 2 even though it has not
+     * persisted it yet. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, 2, 2000);
+
+    munit_assert_int(CLUSTER_RAFT(0)->last_stored, ==, 1);
+    munit_assert_int(CLUSTER_RAFT(0)->commit_index, ==, 2);
+    munit_assert_int(CLUSTER_RAFT(0)->last_applied, ==, 2);
+
+    /* Server 1 stored barrier entry 2, but did not yet receive a notification
+     * from server 0 about the new commit index. */
+    munit_assert_int(CLUSTER_RAFT(1)->last_stored, ==, 2);
+    munit_assert_int(CLUSTER_RAFT(1)->commit_index, ==, 1);
+    munit_assert_int(CLUSTER_RAFT(1)->last_applied, ==, 1);
+
+    /* Disconnect server 0 from servers 1 and 2. */
+    CLUSTER_DISCONNECT(0, 1);
+    CLUSTER_DISCONNECT(0, 2);
+
+    /* Set a very high election timeout on server 0, so it won't step down for
+     * a while, even if disconnected. */
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 10000);
+    raft_set_election_timeout(CLUSTER_RAFT(0), 10000);
+
+    /* Servers 1 and 2 eventually time out and start an election; server 1
+     * wins. */
+    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(4000);
+    CLUSTER_STEP_UNTIL_HAS_LEADER(2000);
+    munit_assert_int(CLUSTER_LEADER, ==, 1);
+
+    /* Server 1 commits the barrier entry at index 3 that it created at the
+     * start of its term. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 2000);
+
+    /* Reconnect server 0 to server 1, which will start replicating entry 3 to
+     * it. */
+    CLUSTER_RECONNECT(0, 1);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 20000);
+
+    return MUNIT_OK;
+}
+
+/* A leader with a faulty disk fails to persist the barrier entry upon
+ * election. */
+TEST(replication, failPersistBarrier, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+
+    /* Server 0 will fail to persist entry 2, a barrier */
+    raft_fixture_append_fault(&f->cluster, 0, 0);
+
+    /* Server 0 gets elected and creates a barrier entry at index 2 */
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+    CLUSTER_START_ELECT(0);
+
+    /* Cluster recovers.
*/ + CLUSTER_STEP_UNTIL_HAS_LEADER(20000); + + return MUNIT_OK; +} + +/* All servers fail to persist the barrier entry upon election of the first + * leader. Ensure the cluster is able to make progress afterwards. + */ +TEST(replication, failPersistBarrierFollower, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + + /* The servers will fail to persist entry 2, a barrier */ + raft_fixture_append_fault(&f->cluster, 1, 0); + raft_fixture_append_fault(&f->cluster, 2, 0); + + /* Server 0 gets elected and creates a barrier entry at index 2 */ + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_START_ELECT(0); + + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + return MUNIT_OK; +} + +/* A leader originates a log entry, fails to persist it, and steps down. + * A follower that received the entry wins the ensuing election and sends + * the same entry back to the original leader, while the original leader + * still has an outgoing pending message that references its copy of the + * entry. This triggers the original leader to reinstate the entry in its + * log. */ +TEST(replication, receiveSameWithPendingSend, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply req; + + /* Three voters. */ + CLUSTER_GROW; + /* Server 0 is the leader. */ + BOOTSTRAP_START_AND_ELECT; + + /* Server 1 never gets the entry. */ + raft_fixture_set_send_latency(&f->cluster, 0, 1, 10000); + + /* Disk write fails, but not before the entry gets to server 2. */ + CLUSTER_SET_DISK_LATENCY(0, 1000); + raft_fixture_append_fault(&f->cluster, 0, 0); + req.data = (void *)(intptr_t)RAFT_IOERR; + CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); + /* Server 0 steps down. */ + CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 1500); + munit_assert_ullong(CLUSTER_RAFT(0)->current_term, ==, 2); + ASSERT_FOLLOWER(1); + ASSERT_FOLLOWER(2); + /* Only server 2 has the new entry. */ + munit_assert_ullong(CLUSTER_RAFT(0)->last_stored, ==, 2); + munit_assert_ullong(CLUSTER_RAFT(1)->last_stored, ==, 2); + munit_assert_ullong(CLUSTER_RAFT(2)->last_stored, ==, 3); + + /* Server 2 times out first and wins the election. */ + raft_set_election_timeout(CLUSTER_RAFT(2), 500); + raft_fixture_start_elect(&f->cluster, 2); + CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_LEADER, 1000); + munit_assert_ullong(CLUSTER_RAFT(2)->current_term, ==, 3); + + /* Server 0 gets the same entry back from server 2. 
*/ + CLUSTER_STEP_UNTIL_APPLIED(2, 3, 1000); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_snapshot.c b/test/raft/integration/test_snapshot.c new file mode 100644 index 000000000..e75d27ba5 --- /dev/null +++ b/test/raft/integration/test_snapshot.c @@ -0,0 +1,860 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(3); + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_ELECT(0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Set the snapshot threshold on all servers of the cluster */ +#define SET_SNAPSHOT_THRESHOLD(VALUE) \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + raft_set_snapshot_threshold(CLUSTER_RAFT(i), VALUE); \ + } \ + } + +/* Set the snapshot trailing logs number on all servers of the cluster */ +#define SET_SNAPSHOT_TRAILING(VALUE) \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + raft_set_snapshot_trailing(CLUSTER_RAFT(i), VALUE); \ + } \ + } + +/* Set the snapshot timeout on all servers of the cluster */ +#define SET_SNAPSHOT_TIMEOUT(VALUE) \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + raft_set_install_snapshot_timeout(CLUSTER_RAFT(i), VALUE); \ + } \ + } + +static int ioMethodSnapshotPutFail(struct raft_io *raft_io, + unsigned trailing, + struct raft_io_snapshot_put *req, + const struct raft_snapshot *snapshot, + raft_io_snapshot_put_cb cb) +{ + (void)raft_io; + (void)trailing; + (void)req; + (void)snapshot; + (void)cb; + return -1; +} + +#define SET_FAULTY_SNAPSHOT_PUT() \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + CLUSTER_RAFT(i)->io->snapshot_put = ioMethodSnapshotPutFail; \ + } \ + } + +static int ioMethodAsyncWorkFail(struct raft_io *raft_io, + struct raft_io_async_work *req, + raft_io_async_work_cb cb) +{ + (void)raft_io; + (void)req; + (void)cb; + return -1; +} + +#define SET_FAULTY_ASYNC_WORK() \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + CLUSTER_RAFT(i)->io->async_work = ioMethodAsyncWorkFail; \ + } \ + } + +static int fsmSnapshotFail(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + (void)fsm; + (void)bufs; + (void)n_bufs; + return -1; +} + +#define SET_FAULTY_SNAPSHOT_ASYNC() \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + CLUSTER_RAFT(i)->fsm->snapshot_async = fsmSnapshotFail; \ + } \ + } + +#define RESET_FSM_ASYNC(I) \ + { \ + struct raft_fsm *fsm = CLUSTER_RAFT(I)->fsm; \ + FsmClose(fsm); \ + FsmInitAsync(fsm, fsm->version); \ + } + +#define SET_FAULTY_SNAPSHOT() \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + CLUSTER_RAFT(i)->fsm->snapshot = fsmSnapshotFail; \ + } \ + } + +/****************************************************************************** + * + * Successfully install a snapshot + * + *****************************************************************************/ + +SUITE(snapshot) + +/* Install a snapshot on a follower that has fallen behind. 
*/ +TEST(snapshot, installOne, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Apply a few of entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect the follower and wait for it to catch up */ + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Check that the leader has sent a snapshot */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + return MUNIT_OK; +} + +/* Install snapshot times out and leader retries */ +TEST(snapshot, installOneTimeOut, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few of entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 so that + * the InstallSnapshot RPC will time out */ + CLUSTER_SET_DISK_LATENCY(2, 300); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Wait a while and check that the leader has sent a snapshot */ + CLUSTER_STEP_UNTIL_ELAPSED(300); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + /* Wait for the snapshot to be installed */ + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Assert that the leader has retried the InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); + + return MUNIT_OK; +} + +/* Install snapshot to an offline node */ +TEST(snapshot, + installOneDisconnectedFromBeginningReconnects, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few of entries, to force a snapshot to be taken. 
Disconnect + * servers 0 and 2 so that the network calls return failure status */ + CLUSTER_DISCONNECT(0, 2); + CLUSTER_DISCONNECT(2, 0); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Wait a while so leader detects offline node */ + CLUSTER_STEP_UNTIL_ELAPSED(2000); + + /* Assert that the leader doesn't try sending a snapshot to an offline node + */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + + CLUSTER_RECONNECT(0, 2); + CLUSTER_RECONNECT(2, 0); + /* Wait for the snapshot to be installed */ + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Assert that the leader has sent an InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + return MUNIT_OK; +} + +/* Install snapshot to an offline node that went down during operation */ +TEST(snapshot, + installOneDisconnectedDuringOperationReconnects, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few of entries */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Wait for follower to catch up*/ + CLUSTER_STEP_UNTIL_APPLIED(2, 5, 5000); + /* Assert that the leader hasn't sent an InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + + CLUSTER_DISCONNECT(0, 2); + CLUSTER_DISCONNECT(2, 0); + + /* Wait a while so leader detects offline node */ + CLUSTER_STEP_UNTIL_ELAPSED(2000); + + /* Apply a few more entries */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Assert that the leader doesn't try sending snapshot to an offline node */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + + CLUSTER_RECONNECT(0, 2); + CLUSTER_RECONNECT(2, 0); + CLUSTER_STEP_UNTIL_APPLIED(2, 8, 5000); + + /* Assert that the leader has tried sending an InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + return MUNIT_OK; +} + +/* No snapshots sent to killed nodes */ +TEST(snapshot, noSnapshotInstallToKilled, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Kill a server */ + CLUSTER_KILL(2); + + /* Apply a few of entries */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Wait a while */ + CLUSTER_STEP_UNTIL_ELAPSED(4000); + + /* Assert that the leader hasn't sent an InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + return MUNIT_OK; +} + +/* Install snapshot times out and leader retries, afterwards AppendEntries + * resume */ +TEST(snapshot, installOneTimeOutAppendAfter, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few of entries, to force a snapshot to be taken. 
Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 so that + * the InstallSnapshot RPC will time out */ + CLUSTER_SET_DISK_LATENCY(2, 300); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Wait for the snapshot to be installed */ + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Append a few entries and check if they are replicated */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_STEP_UNTIL_APPLIED(2, 5, 5000); + + /* Assert that the leader has retried the InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); + + return MUNIT_OK; +} + +/* Install 2 snapshots that both time out and assure the follower catches up */ +TEST(snapshot, installMultipleTimeOut, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 so that + * the InstallSnapshot RPC will time out */ + CLUSTER_SET_DISK_LATENCY(2, 300); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Step until the snapshot times out */ + CLUSTER_STEP_UNTIL_ELAPSED(400); + + /* Apply a few more entries, to force a new snapshot to be taken. Drop + * all traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect the follower */ + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + CLUSTER_STEP_UNTIL_APPLIED(2, 7, 5000); + + /* Assert that the leader has sent multiple InstallSnapshot RPCs */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), >=, 2); + + return MUNIT_OK; +}
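+ +/* The assertion above uses >= rather than == because the attempt count is + * timing dependent: each expired 200 ms snapshot timeout appears to trigger a + * resend, and the second snapshot can supersede the first, so the exact number + * of InstallSnapshot RPCs may vary from run to run. */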
+ +/* Install 2 snapshots that both time out, launch a few regular AppendEntries + * and assure the follower catches up */ +TEST(snapshot, installMultipleTimeOutAppendAfter, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 so that + * the InstallSnapshot RPC will time out */ + CLUSTER_SET_DISK_LATENCY(2, 300); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Step until the snapshot times out */ + CLUSTER_STEP_UNTIL_ELAPSED(400); + + /* Apply a few more entries, to force a new snapshot to be taken. Drop + * all traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect the follower */ + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + /* Append a few entries and make sure the follower catches up */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_STEP_UNTIL_APPLIED(2, 9, 5000); + + /* Assert that the leader has sent multiple InstallSnapshot RPCs */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), >=, 2); + + return MUNIT_OK; +} + +static bool server_installing_snapshot(struct raft_fixture *f, void *data) +{ + (void)f; + const struct raft *r = data; + return r->snapshot.put.data != NULL && r->last_stored == 0; +} + +static bool server_taking_snapshot(struct raft_fixture *f, void *data) +{ + (void)f; + const struct raft *r = data; + return r->snapshot.put.data != NULL && r->last_stored != 0; +} + +static bool server_snapshot_done(struct raft_fixture *f, void *data) +{ + (void)f; + const struct raft *r = data; + return r->snapshot.put.data == NULL; +} + +/* Follower receives HeartBeats during the installation of a snapshot */ +TEST(snapshot, installSnapshotHeartBeats, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + CLUSTER_SATURATE_BOTHWAYS(0, 1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Set a large disk latency on the follower; this will allow some + * heartbeats to be sent during the snapshot installation */ + CLUSTER_SET_DISK_LATENCY(1, 2000); + + munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + + /* Step the cluster until server 1 installs a snapshot */ + const struct raft *r = CLUSTER_RAFT(1); + CLUSTER_DESATURATE_BOTHWAYS(0, 1); + CLUSTER_STEP_UNTIL(server_installing_snapshot, (void *)r, 2000); + munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + /* Count the number of AppendEntries RPCs received during the snapshot + * install */ + unsigned before = CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES); + CLUSTER_STEP_UNTIL(server_snapshot_done, (void *)r, 5000); + unsigned after = CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES); + munit_assert_uint(before, <, after); + + /* Check that the InstallSnapshot RPC was not resent */ + munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + /* Check that the snapshot was applied and we can still make progress */ + CLUSTER_STEP_UNTIL_APPLIED(1, 4, 5000); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_STEP_UNTIL_APPLIED(1, 6, 5000); + + return MUNIT_OK; +} + +/* InstallSnapshot RPC arrives while persisting Entries */ +TEST(snapshot, installSnapshotDuringEntriesWrite, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set a large disk latency on the follower; this will allow an + * InstallSnapshot RPC to arrive while the entries are being persisted. 
*/ + CLUSTER_SET_DISK_LATENCY(1, 2000); + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Replicate some entries; these will take a while to persist */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Make sure leader can't successfully send any more entries */ + CLUSTER_DISCONNECT(0, 1); + CLUSTER_MAKE_PROGRESS; /* Snapshot taken here */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; /* Snapshot taken here */ + CLUSTER_MAKE_PROGRESS; + + /* Snapshot with index 6 is sent while follower is still writing the entries + * to disk that arrived before the disconnect. */ + CLUSTER_RECONNECT(0, 1); + + /* Make sure follower is up to date */ + CLUSTER_STEP_UNTIL_APPLIED(1, 7, 5000); + return MUNIT_OK; +} + +static char *fsm_version[] = {"1", "2", "3", NULL}; +static char *fsm_snapshot_async[] = {"0", "1", NULL}; +static MunitParameterEnum fsm_snapshot_async_params[] = { + {CLUSTER_SS_ASYNC_PARAM, fsm_snapshot_async}, + {CLUSTER_FSM_VERSION_PARAM, fsm_version}, + {NULL, NULL}, +}; + +static char *fsm_snapshot_only_async[] = {"1", NULL}; +static char *fsm_version_only_async[] = {"3", NULL}; +static MunitParameterEnum fsm_snapshot_only_async_params[] = { + {CLUSTER_SS_ASYNC_PARAM, fsm_snapshot_only_async}, + {CLUSTER_FSM_VERSION_PARAM, fsm_version_only_async}, + {NULL, NULL}, +}; + +/* Follower receives AppendEntries RPCs while taking a snapshot */ +TEST(snapshot, + takeSnapshotAppendEntries, + setUp, + tearDown, + 0, + fsm_snapshot_async_params) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Set a large disk latency on the follower; this will allow AppendEntries + * to be sent while a snapshot is taken */ + CLUSTER_SET_DISK_LATENCY(1, 2000); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Step the cluster until server 1 takes a snapshot */ + const struct raft *r = CLUSTER_RAFT(1); + CLUSTER_STEP_UNTIL(server_taking_snapshot, (void *)r, 3000); + + /* Send AppendEntries RPCs while server 1 is taking a snapshot */ + static struct raft_apply reqs[5]; + for (int i = 0; i < 5; i++) { + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &reqs[i], 1, NULL); + } + CLUSTER_STEP_UNTIL(server_snapshot_done, (void *)r, 5000); + + /* Make sure the AppendEntries are applied and we can make progress */ + CLUSTER_STEP_UNTIL_APPLIED(1, 9, 5000); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_STEP_UNTIL_APPLIED(1, 11, 5000); + return MUNIT_OK; +} + +TEST(snapshot, + takeSnapshotSnapshotPutFail, + setUp, + tearDown, + 0, + fsm_snapshot_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_SNAPSHOT_PUT(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* No crashes or leaks have occurred */ + return MUNIT_OK; +}
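+ +/* The parameter matrices above run each test once per combination of FSM + * version ("1", "2", "3") and synchronous vs asynchronous snapshot taking; + * the *_only_async variants pin version 3 together with async snapshots, + * which suggests fsm->snapshot_async is only available from FSM version 3 + * onwards. */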
+ +TEST(snapshot, + takeSnapshotAsyncWorkFail, + setUp, + tearDown, + 0, + fsm_snapshot_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_ASYNC_WORK(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* No crashes or leaks have occurred */ + return MUNIT_OK; +} + +TEST(snapshot, + takeSnapshotAsyncFail, + setUp, + tearDown, + 0, + fsm_snapshot_only_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_SNAPSHOT_ASYNC(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* No crashes or leaks have occurred */ + return MUNIT_OK; +} + +TEST(snapshot, + takeSnapshotAsyncFailOnce, + setUp, + tearDown, + 0, + fsm_snapshot_only_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_SNAPSHOT_ASYNC(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + /* Wait for snapshot to fail. */ + CLUSTER_STEP_UNTIL_ELAPSED(200); + /* Snapshot will have failed here. */ + + /* Set the non-faulty fsm->snapshot_async function */ + RESET_FSM_ASYNC(CLUSTER_LEADER); + CLUSTER_MAKE_PROGRESS; + + /* Wait for snapshot to be finished */ + CLUSTER_STEP_UNTIL_ELAPSED(200); + + /* Reconnect the follower and wait for it to catch up */ + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Check that the leader has sent a snapshot */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + return MUNIT_OK; +} + +TEST(snapshot, takeSnapshotFail, setUp, tearDown, 0, fsm_snapshot_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_SNAPSHOT(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* No crashes or leaks have occurred */ + return MUNIT_OK; +} + +/* A follower doesn't convert to candidate state while it's installing a + * snapshot. */ +TEST(snapshot, snapshotBlocksCandidate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 */ + CLUSTER_SET_DISK_LATENCY(2, 5000); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Wait a while and check that the leader has sent a snapshot */ + CLUSTER_STEP_UNTIL_ELAPSED(500); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + /* Disconnect the servers again so that heartbeats, etc. 
won't arrive */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER); + munit_assert_ptr(CLUSTER_RAFT(2)->snapshot.put.data, !=, NULL); + CLUSTER_STEP_UNTIL_ELAPSED(4000); + munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/* An UNAVAILABLE node doesn't install snapshots. */ +TEST(snapshot, unavailableDiscardsSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers */ + CLUSTER_SET_DISK_LATENCY(2, 600); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Wait a while and check that the leader has sent a snapshot */ + CLUSTER_STEP_UNTIL_ELAPSED(500); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + raft_fixture_make_unavailable(&f->cluster, 2); + CLUSTER_STEP_UNTIL_ELAPSED(500); + munit_assert_uint64(raft_last_applied(CLUSTER_RAFT(2)), ==, 1); + return MUNIT_OK; +} + +/* A new term starts while a node is installing a snapshot. */ +TEST(snapshot, newTermWhileInstalling, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers */ + CLUSTER_SET_DISK_LATENCY(2, 3000); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + /* Wait a while and check that the leader has sent a snapshot */ + CLUSTER_STEP_UNTIL_ELAPSED(500); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + /* Force a new term to start */ + CLUSTER_DEPOSE; + CLUSTER_ELECT(1); + CLUSTER_STEP_UNTIL_ELAPSED(1000); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_start.c b/test/raft/integration/test_start.c new file mode 100644 index 000000000..d49cf2c88 --- /dev/null +++ b/test/raft/integration/test_start.c @@ -0,0 +1,223 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture with a fake raft_io instance. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Bootstrap the I'th server. 
*/ +#define BOOTSTRAP(I) \ + do { \ + struct raft_configuration _configuration; \ + int _rv; \ + struct raft *_raft; \ + CLUSTER_CONFIGURATION(&_configuration); \ + _raft = CLUSTER_RAFT(I); \ + _rv = raft_bootstrap(_raft, &_configuration); \ + munit_assert_int(_rv, ==, 0); \ + raft_configuration_close(&_configuration); \ + } while (0) + +/****************************************************************************** + * + * Set up a cluster with a single server. + * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(1); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * raft_start + * + *****************************************************************************/ + +SUITE(raft_start) + +/* There are two servers. The first has a snapshot present and no other + * entries. */ +TEST(raft_start, oneSnapshotAndNoEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + CLUSTER_SET_SNAPSHOT(0 /* server index */, + 6 /* last index */, + 2 /* last term */, + 1 /* conf index */, + 5 /* x */, + 7 /* y */); + CLUSTER_SET_TERM(0, 2); + BOOTSTRAP(1); + CLUSTER_START; + CLUSTER_MAKE_PROGRESS; + return MUNIT_OK; +} + +/* There are two servers. The first has a snapshot along with some follow-up + * entries. */ +TEST(raft_start, oneSnapshotAndSomeFollowUpEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry entries[2]; + struct raft_fsm *fsm; + + CLUSTER_GROW; + BOOTSTRAP(1); + + entries[0].type = RAFT_COMMAND; + entries[0].term = 2; + FsmEncodeSetX(6, &entries[0].buf); + + entries[1].type = RAFT_COMMAND; + entries[1].term = 2; + FsmEncodeAddY(2, &entries[1].buf); + + CLUSTER_SET_SNAPSHOT(0 /* server index */, + 6 /* last index */, + 2 /* last term */, + 1 /* conf index */, + 5 /* x */, + 7 /* y */); + CLUSTER_ADD_ENTRY(0, &entries[0]); + CLUSTER_ADD_ENTRY(1, &entries[1]); + CLUSTER_SET_TERM(0, 2); + + CLUSTER_START; + CLUSTER_MAKE_PROGRESS; + + fsm = CLUSTER_FSM(0); + munit_assert_int(FsmGetX(fsm), ==, 7); + + return MUNIT_OK; +} + +/****************************************************************************** + * + * Start with entries present on disk. + * + *****************************************************************************/ + +/* There are 3 servers. The first has no entries present at all */ +TEST(raft_start, noEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + CLUSTER_GROW; + BOOTSTRAP(1); + BOOTSTRAP(2); + CLUSTER_START; + CLUSTER_MAKE_PROGRESS; + return MUNIT_OK; +}
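+ +/* Only the servers that start from a blank disk get bootstrapped above: + * raft_bootstrap refuses to run on a server that already has state on disk + * (it reports RAFT_CANTBOOTSTRAP in that case), which is why the server + * seeded with CLUSTER_SET_SNAPSHOT is left alone. */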
+ +/* There are 3 servers, the first has some entries, the others don't. */ +TEST(raft_start, twoEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_configuration configuration; + struct raft_entry entry; + struct raft_fsm *fsm; + unsigned i; + int rv; + + CLUSTER_GROW; + CLUSTER_GROW; + + CLUSTER_CONFIGURATION(&configuration); + rv = raft_bootstrap(CLUSTER_RAFT(0), &configuration); + munit_assert_int(rv, ==, 0); + raft_configuration_close(&configuration); + + entry.type = RAFT_COMMAND; + entry.term = 3; + FsmEncodeSetX(123, &entry.buf); + + CLUSTER_ADD_ENTRY(0, &entry); + CLUSTER_SET_TERM(0, 3); + + BOOTSTRAP(1); + BOOTSTRAP(2); + + CLUSTER_START; + CLUSTER_ELECT(0); + CLUSTER_MAKE_PROGRESS; + + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 3000); + + for (i = 0; i < CLUSTER_N; i++) { + fsm = CLUSTER_FSM(i); + munit_assert_int(FsmGetX(fsm), ==, 124); + } + + return MUNIT_OK; +} + +/* There is a single voting server in the cluster, which immediately elects + * itself when starting. */ +TEST(raft_start, singleVotingSelfElect, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + munit_assert_int(CLUSTER_STATE(0), ==, RAFT_LEADER); + CLUSTER_MAKE_PROGRESS; + return MUNIT_OK; +} + +/* There are two servers in the cluster, one is voting and the other is + * not. When started, the non-voting server does not elect itself. */ +TEST(raft_start, singleVotingNotUs, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + CLUSTER_BOOTSTRAP_N_VOTING(1); + CLUSTER_START; + munit_assert_int(CLUSTER_STATE(1), ==, RAFT_FOLLOWER); + CLUSTER_MAKE_PROGRESS; + return MUNIT_OK; +} + +static void state_cb(struct raft *r, unsigned short old, unsigned short new) +{ + munit_assert_ushort(old, !=, new); + r->data = (void *)(uintptr_t)0xFEEDBEEF; +} + +/* There is a single voting server in the cluster; register a state_cb and + * assert that it's called because the node will progress to leader. 
*/ +TEST(raft_start, singleVotingWithStateCb, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_BOOTSTRAP; + struct raft *r = CLUSTER_RAFT(0); + r->data = (void *)(uintptr_t)0; + raft_register_state_cb(r, state_cb); + CLUSTER_START; + munit_assert_uint((uintptr_t)r->data, ==, 0xFEEDBEEF); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_strerror.c b/test/raft/integration/test_strerror.c new file mode 100644 index 000000000..ae45e1867 --- /dev/null +++ b/test/raft/integration/test_strerror.c @@ -0,0 +1,49 @@ +#include "../../../src/raft.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * raft_strerror + * + *****************************************************************************/ + +SUITE(raft_strerror) + +#define ERR_CODE_MAP(X) \ + X(RAFT_NOMEM) \ + X(RAFT_BADID) \ + X(RAFT_DUPLICATEID) \ + X(RAFT_DUPLICATEADDRESS) \ + X(RAFT_BADROLE) \ + X(RAFT_MALFORMED) \ + X(RAFT_NOTLEADER) \ + X(RAFT_LEADERSHIPLOST) \ + X(RAFT_SHUTDOWN) \ + X(RAFT_CANTBOOTSTRAP) \ + X(RAFT_CANTCHANGE) \ + X(RAFT_CORRUPT) \ + X(RAFT_CANCELED) \ + X(RAFT_NAMETOOLONG) \ + X(RAFT_TOOBIG) \ + X(RAFT_NOCONNECTION) \ + X(RAFT_BUSY) \ + X(RAFT_IOERR) + +#define TEST_CASE_STRERROR(CODE) \ + TEST(raft_strerror, CODE, NULL, NULL, 0, NULL) \ + { \ + (void)data; \ + (void)params; \ + munit_assert_not_null(raft_strerror(CODE)); \ + return MUNIT_OK; \ + } + +ERR_CODE_MAP(TEST_CASE_STRERROR) + +TEST(raft_strerror, default, NULL, NULL, 0, NULL) +{ + (void)data; + (void)params; + munit_assert_string_equal(raft_strerror(666), "unknown error"); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_tick.c b/test/raft/integration/test_tick.c new file mode 100644 index 000000000..807518b91 --- /dev/null +++ b/test/raft/integration/test_tick.c @@ -0,0 +1,261 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + const char *n_voting_param = munit_parameters_get(params, "n_voting"); + unsigned n = 3; + unsigned n_voting = n; + if (n_voting_param != NULL) { + n_voting = atoi(n_voting_param); + } + SETUP_CLUSTER(n); + CLUSTER_BOOTSTRAP_N_VOTING(n_voting); + CLUSTER_START; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert the current value of the timer of the I'th raft instance */ +#define ASSERT_ELECTION_TIMER(I, MSECS) \ + { \ + struct raft *raft_ = CLUSTER_RAFT(I); \ + munit_assert_int( \ + raft_->io->time(raft_->io) - raft_->election_timer_start, ==, \ + MSECS); \ + } + +/* Assert the current state of the I'th raft instance. */ +#define ASSERT_STATE(I, STATE) munit_assert_int(CLUSTER_STATE(I), ==, STATE); + +/****************************************************************************** + * + * Tick callback + * + *****************************************************************************/ + +SUITE(tick) + +/* Internal timers are updated according to the given time delta. 
*/ +TEST(tick, electionTimer, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + CLUSTER_STEP; + ASSERT_ELECTION_TIMER(0, 100); + + CLUSTER_STEP; + ASSERT_ELECTION_TIMER(1, 100); + + CLUSTER_STEP; + ASSERT_ELECTION_TIMER(2, 100); + + CLUSTER_STEP; + ASSERT_ELECTION_TIMER(0, 200); + + return MUNIT_OK; +} + +/* If the election timeout expires, the follower is a voting server, and it + * hasn't voted yet in this term, then become candidate and start a new + * election. */ +TEST(tick, candidate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + (void)params; + + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout); + + /* The term has been incremented. */ + munit_assert_int(raft->current_term, ==, 2); + + /* We have voted for ourselves. */ + munit_assert_int(raft->voted_for, ==, 1); + + /* We are candidate */ + ASSERT_STATE(0, RAFT_CANDIDATE); + + /* The votes array is initialized */ + munit_assert_ptr_not_null(raft->candidate_state.votes); + munit_assert_true(raft->candidate_state.votes[0]); + munit_assert_false(raft->candidate_state.votes[1]); + + return MUNIT_OK; +} + +/* If the election timeout has not elapsed, stay follower. */ +TEST(tick, electionTimerNotExpired, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + (void)params; + + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout - 100); + ASSERT_STATE(0, RAFT_FOLLOWER); + + return MUNIT_OK; +} + +static char *elapse_non_voter_n_voting[] = {"1", NULL}; + +static MunitParameterEnum elapse_non_voter_params[] = { + {"n_voting", elapse_non_voter_n_voting}, + {NULL, NULL}, +}; + +/* If the election timeout has elapsed, but we're not a voter, stay follower. */ +TEST(tick, not_voter, setUp, tearDown, 0, elapse_non_voter_params) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(1); + (void)params; + + /* Prevent the timer of the first server from expiring. */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 2000); + raft_set_election_timeout(CLUSTER_RAFT(0), 2000); + + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout + 100); + ASSERT_STATE(1, RAFT_FOLLOWER); + + return MUNIT_OK; +} + +/* If we're leader and the election timeout elapses without hearing from a + * majority of the cluster, step down. */ +TEST(tick, no_contact, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + CLUSTER_ELECT(0); + CLUSTER_SATURATE_BOTHWAYS(0, 1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Wait for the leader to step down. */ + CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000); + + return MUNIT_OK; +}
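+ +/* The fixture assigns each server a randomized election timeout, by the usual + * raft convention somewhere in [election_timeout, 2 * election_timeout), + * which is why the tests above step by + * raft->follower_state.randomized_election_timeout rather than by a fixed + * constant; raft_fixture_set_randomized_election_timeout (used in not_voter) + * pins that value when a test needs a server to stay quiet. */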
+ +/* If we're candidate and the election timeout has elapsed, start a new + * election. */ +TEST(tick, new_election, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + + (void)params; + + CLUSTER_SATURATE_BOTHWAYS(0, 1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Become candidate */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout); + + /* Expire the election timeout */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->candidate_state.randomized_election_timeout); + + /* The term has been incremented and saved to stable store. */ + munit_assert_int(raft->current_term, ==, 3); + + /* We have voted for ourselves. */ + munit_assert_int(raft->voted_for, ==, 1); + + /* We are still candidate */ + ASSERT_STATE(0, RAFT_CANDIDATE); + + /* The votes array is initialized */ + munit_assert_ptr_not_null(raft->candidate_state.votes); + munit_assert_true(raft->candidate_state.votes[0]); + munit_assert_false(raft->candidate_state.votes[1]); + + return MUNIT_OK; +} + +/* If the election timeout has not elapsed, stay candidate. */ +TEST(tick, during_election, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + (void)params; + + CLUSTER_SATURATE_BOTHWAYS(0, 1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Become candidate */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout); + + /* Make some time elapse, but not enough to trigger the timeout */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->candidate_state.randomized_election_timeout - 100); + + /* We are still candidate at the same term */ + ASSERT_STATE(0, RAFT_CANDIDATE); + munit_assert_int(raft->current_term, ==, 2); + + return MUNIT_OK; +} + +static char *elapse_request_vote_only_to_voters_n_voting[] = {"2", NULL}; + +static MunitParameterEnum elapse_request_vote_only_to_voters_params[] = { + {"n_voting", elapse_request_vote_only_to_voters_n_voting}, + {NULL, NULL}, +}; + +/* Vote requests are sent only to voting servers. */ +TEST(tick, + request_vote_only_to_voters, + setUp, + tearDown, + 0, + elapse_request_vote_only_to_voters_params) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + (void)params; + + CLUSTER_SATURATE_BOTHWAYS(0, 1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Become candidate */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout); + + /* We have sent vote requests only to the voting server */ + //__assert_request_vote(f, 2, 2, 1, 1); + + return MUNIT_OK; +} diff --git a/test/raft/integration/test_transfer.c b/test/raft/integration/test_transfer.c new file mode 100644 index 000000000..a51d70898 --- /dev/null +++ b/test/raft/integration/test_transfer.c @@ -0,0 +1,209 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture with a test raft cluster. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +static void transferCb(struct raft_transfer *req) +{ + bool *done = req->data; + munit_assert_false(*done); + *done = true; +} + +static bool transferCbHasFired(struct raft_fixture *f, void *arg) +{ + bool *done = arg; + (void)f; + return *done; +} + +/* Submit a transfer leadership request against the I'th server. */ +#define TRANSFER_SUBMIT(I, ID) \ + struct raft *_raft = CLUSTER_RAFT(I); \ + struct raft_transfer _req; \ + bool _done = false; \ + int _rv; \ + _req.data = &_done; \ + _rv = raft_transfer(_raft, &_req, ID, transferCb); \ + munit_assert_int(_rv, ==, 0); + +/* Wait until the transfer leadership request completes. */ +#define TRANSFER_WAIT CLUSTER_STEP_UNTIL(transferCbHasFired, &_done, 2000) + +/* Submit a transfer leadership request and wait for it to complete. 
*/ +#define TRANSFER(I, ID) \ + do { \ + TRANSFER_SUBMIT(I, ID); \ + TRANSFER_WAIT; \ + } while (0) + +/* Submit a transfer leadership request against the I'th server and assert that + * the given error is returned. */ +#define TRANSFER_ERROR(I, ID, RV, ERRMSG) \ + do { \ + struct raft_transfer __req; \ + int __rv; \ + __rv = raft_transfer(CLUSTER_RAFT(I), &__req, ID, NULL); \ + munit_assert_int(__rv, ==, RV); \ + munit_assert_string_equal(CLUSTER_ERRMSG(I), ERRMSG); \ + } while (0) + +/****************************************************************************** + * + * Set up a cluster with three servers. + * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(3); + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_ELECT(0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * raft_transfer + * + *****************************************************************************/ + +SUITE(raft_transfer) + +/* The follower we ask to transfer leadership to is up-to-date. */ +TEST(raft_transfer, upToDate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TRANSFER(0, 2); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, ==, 1); + return MUNIT_OK; +} + +/* The follower we ask to transfer leadership to needs to catch up. */ +TEST(raft_transfer, catchUp, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply req; + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req, 1, NULL); + TRANSFER(0, 2); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, ==, 1); + return MUNIT_OK; +} + +/* The follower we ask to transfer leadership to is down and the leadership + * transfer does not succeed. */ +TEST(raft_transfer, expire, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply req; + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req, 1, NULL); + CLUSTER_KILL(1); + TRANSFER(0, 2); + munit_assert_int(CLUSTER_LEADER, ==, 0); + return MUNIT_OK; +} + +/* The given ID doesn't match any server in the current configuration. */ +TEST(raft_transfer, unknownServer, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TRANSFER_ERROR(0, 4, RAFT_BADID, "server ID is not valid"); + return MUNIT_OK; +} + +/* Submitting a transfer request twice is an error. */ +TEST(raft_transfer, twice, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TRANSFER_SUBMIT(0, 2); + TRANSFER_ERROR(0, 3, RAFT_NOTLEADER, "server is not the leader"); + TRANSFER_WAIT; + return MUNIT_OK; +} + +/* If the given ID is zero, the target is selected automatically. */ +TEST(raft_transfer, autoSelect, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TRANSFER(0, 0); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, !=, 0); + return MUNIT_OK; +} + +/* If the given ID is zero, the target is selected automatically. Followers that + * are up-to-date are preferred. */ +TEST(raft_transfer, autoSelectUpToDate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_KILL(1); + CLUSTER_MAKE_PROGRESS; + TRANSFER(0, 0); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, ==, 2); + return MUNIT_OK; +}
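+ +/* A note on the twice test above: raft_transfer appears to reuse the + * RAFT_NOTLEADER error for the case where a transfer is already in flight, + * which would explain why the second submission fails with "server is not the + * leader" even though server 0 still is the leader at that point. */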
+ +/* It's not possible to transfer leadership after the server has been + * demoted. */ +TEST(raft_transfer, afterDemotion, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_change req; + struct raft *raft = CLUSTER_RAFT(0); + int rv; + CLUSTER_ADD(&req); + CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000); + CLUSTER_ASSIGN(&req, RAFT_VOTER); + CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); + rv = raft_assign(raft, &req, raft->id, RAFT_SPARE, NULL); + munit_assert_int(rv, ==, 0); + CLUSTER_STEP_UNTIL_APPLIED(0, 5, 1000); + TRANSFER_ERROR(0, 2, RAFT_NOTLEADER, "server is not the leader"); + return MUNIT_OK; +} + +static char *cluster_pre_vote[] = {"0", "1", NULL}; +static char *cluster_heartbeat[] = {"1", "100", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, + {CLUSTER_HEARTBEAT_PARAM, cluster_heartbeat}, + {NULL, NULL}, +}; + +/* It's possible to transfer leadership even when pre-vote is active */ +TEST(raft_transfer, preVote, setUp, tearDown, 0, _params) +{ + struct fixture *f = data; + TRANSFER(0, 2); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, ==, 1); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_append.c b/test/raft/integration/test_uv_append.c new file mode 100644 index 000000000..11ff501ef --- /dev/null +++ b/test/raft/integration/test_uv_append.c @@ -0,0 +1,1005 @@ +#include "../../../src/raft/uv.h" +#include "../lib/aio.h" +#include "../lib/runner.h" +#include "../lib/uv.h" +#include "append_helpers.h" + +#include <unistd.h> /* reconstructed include; sleep() is used below */ + +/* Maximum number of blocks a segment can have */ +#define MAX_SEGMENT_BLOCKS 4 + +/* This block size should work fine for all file systems. */ +#define SEGMENT_BLOCK_SIZE 4096 + +/* Default segment size */ +#define SEGMENT_SIZE 4096 * MAX_SEGMENT_BLOCKS + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; + int count; /* To generate deterministic entry data */ +}; + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_UV; + raft_uv_set_block_size(&f->io, SEGMENT_BLOCK_SIZE); + raft_uv_set_segment_size(&f->io, SEGMENT_SIZE); + f->count = 0; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + if (f == NULL) { + return; + } + TEAR_DOWN_UV_DEPS; + free(f); +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + if (f == NULL) { + return; + } + TEAR_DOWN_UV; + tearDownDeps(f); +}
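+ +/* Note the split above: tests that shut the raft_io instance down themselves + * (the ASSERT_ENTRIES macro below starts with TEAR_DOWN_UV) register + * tearDownDeps as their tear-down hook, so the uv instance isn't closed + * twice; everything else uses the full tearDown. */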
+ +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Shut down the fixture's raft_io instance, then load all entries on disk using + * a new raft_io instance, and assert that there are N entries with a total data + * size of TOTAL_DATA_SIZE bytes. */ +#define ASSERT_ENTRIES(N, TOTAL_DATA_SIZE) \ TEAR_DOWN_UV; \ do { \ struct uv_loop_s _loop; \ struct raft_uv_transport _transport; \ struct raft_io _io; \ raft_term _term; \ raft_id _voted_for; \ struct raft_snapshot *_snapshot; \ raft_index _start_index; \ struct raft_entry *_entries; \ size_t _i; \ size_t _n; \ void *_batch = NULL; \ size_t _total_data_size = 0; \ int _rv; \ \ _rv = uv_loop_init(&_loop); \ munit_assert_int(_rv, ==, 0); \ _transport.version = 1; \ _rv = raft_uv_tcp_init(&_transport, &_loop); \ munit_assert_int(_rv, ==, 0); \ _rv = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ munit_assert_int(_rv, ==, 0); \ _rv = _io.init(&_io, 1, "1"); \ if (_rv != 0) { \ munit_errorf("io->init(): %s (%d)", _io.errmsg, _rv); \ } \ _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \ &_entries, &_n); \ if (_rv != 0) { \ munit_errorf("io->load(): %s (%d)", _io.errmsg, _rv); \ } \ _io.close(&_io, NULL); \ uv_run(&_loop, UV_RUN_NOWAIT); \ raft_uv_close(&_io); \ raft_uv_tcp_close(&_transport); \ uv_loop_close(&_loop); \ \ munit_assert_ptr_null(_snapshot); \ munit_assert_int(_n, ==, N); \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ uint64_t _value = *(uint64_t *)_entry->buf.base; \ munit_assert_int(_entry->term, ==, 1); \ munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ munit_assert_int(_value, ==, _i); \ munit_assert_ptr_not_null(_entry->batch); \ } \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ if (_entry->batch != _batch) { \ _batch = _entry->batch; \ raft_free(_batch); \ } \ _total_data_size += _entry->buf.len; \ } \ raft_free(_entries); \ munit_assert_int(_total_data_size, ==, TOTAL_DATA_SIZE); \ } while (0); + +/****************************************************************************** + * + * raft_io->append() + * + *****************************************************************************/ + +SUITE(append) + +/* Append an entries array containing unaligned buffers. */ +TEST(append, unaligned, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT_CB_DATA(0, 1, 9, NULL, NULL, RAFT_INVALID); + munit_assert_string_equal(f->io.errmsg, + "entry buffers must be 8-byte aligned"); + APPEND_SUBMIT_CB_DATA(1, 3, 63, NULL, NULL, RAFT_INVALID); + munit_assert_string_equal(f->io.errmsg, + "entry buffers must be 8-byte aligned"); + return MUNIT_OK; +} + +/* Append the very first batch of entries. */ +TEST(append, first, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + ASSERT_ENTRIES(1, 64); + return MUNIT_OK; +} + +/* As soon as the backend starts writing the first open segment, a second one + * and a third one get prepared. */ +TEST(append, prepareSegments, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + while (!DirHasFile(f->dir, "open-3")) { + LOOP_RUN(1); + } + munit_assert_true(DirHasFile(f->dir, "open-1")); + munit_assert_true(DirHasFile(f->dir, "open-2")); + munit_assert_true(DirHasFile(f->dir, "open-3")); + return MUNIT_OK; +} + +/* Once the first segment fills up, it gets finalized, and an additional one + * gets prepared, to maintain the available segments pool size. 
*/ +TEST(append, finalizeSegment, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + APPEND(1, 64); + while (!DirHasFile(f->dir, "open-4")) { + LOOP_RUN(1); + } + munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000004")); + munit_assert_false(DirHasFile(f->dir, "open-1")); + munit_assert_true(DirHasFile(f->dir, "open-4")); + return MUNIT_OK; +} + +/* The very first batch of entries to append is bigger than the regular open + * segment size. */ +TEST(append, firstBig, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + ASSERT_ENTRIES(MAX_SEGMENT_BLOCKS, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + return MUNIT_OK; +} + +/* The second batch of entries to append is bigger than the regular open + * segment size. */ +TEST(append, secondBig, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + return MUNIT_OK; +} + +/* Schedule multiple appends, each one exceeding the segment size. */ +TEST(append, severalBig, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(1, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(2, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + APPEND_WAIT(0); + APPEND_WAIT(1); + APPEND_WAIT(2); + ASSERT_ENTRIES(6, 6 * MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + return MUNIT_OK; +} + +/* Write the very first entry and then another one, both fitting in the same + * block. */ +TEST(append, fitBlock, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + APPEND(1, 64); + ASSERT_ENTRIES(2, 128); + return MUNIT_OK; +} + +/* Write an entry that fills the first block exactly and then another one. */ +TEST(append, matchBlock, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + size_t size; + + size = SEGMENT_BLOCK_SIZE; + size -= sizeof(uint64_t) + /* Format */ + sizeof(uint64_t) + /* Checksums */ + 8 + 16; /* Header */ + + APPEND(1, size); + APPEND(1, 64); + + ASSERT_ENTRIES(2, size + 64); + + return MUNIT_OK; +} + +/* Write an entry that exceeds the first block, then another one that fits in + * the second block, then a third one that fills the rest of the second block + * plus the whole third block exactly, and finally a fourth entry that fits in + * the fourth block */ +TEST(append, exceedBlock, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + size_t written; + size_t size1; + size_t size2; + + size1 = SEGMENT_BLOCK_SIZE; + + APPEND(1, size1); + APPEND(1, 64); + + written = sizeof(uint64_t) + /* Format version */ + 2 * sizeof(uint32_t) + /* CRC sums of first batch */ + 8 + 16 + /* Header of first batch */ + size1 + /* Size of first batch */ + 2 * sizeof(uint32_t) + /* CRC of second batch */ + 8 + 16 + /* Header of second batch */ + 64; /* Size of second batch */ + + /* Write a third entry that fills the second block exactly */ + size2 = SEGMENT_BLOCK_SIZE - (written % SEGMENT_BLOCK_SIZE); + size2 -= (2 * sizeof(uint32_t) + 8 + 16); + size2 += SEGMENT_BLOCK_SIZE; + + APPEND(1, size2); + + /* Write a fourth entry */ + APPEND(1, 64); + + ASSERT_ENTRIES(4, size1 + 64 + size2 + 64); + + return MUNIT_OK; +}
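+ +/* The byte accounting in the last two tests follows the segment layout that + * their own annotations imply: each segment starts with one 8-byte format + * word, and each batch contributes two 4-byte CRC sums, an 8-byte entry count + * plus 16 bytes of header per entry, and then the entry data itself. For a + * single 1-entry batch in a fresh segment that's 8 + 8 + 8 + 16 = 40 bytes of + * overhead, which is why an entry of SEGMENT_BLOCK_SIZE - 40 = 4056 bytes + * fills the first 4096-byte block exactly. */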
+ +/* If an append request is submitted before the write operation of the previous + * append request is started, then a single write will be performed for both + * requests. */ +TEST(append, batch, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 1, 64); + APPEND_SUBMIT(1, 1, 64); + APPEND_WAIT(0); + APPEND_WAIT(1); + return MUNIT_OK; +} + +/* An append request submitted while a write operation is in progress gets + * executed only when the write completes. */ +TEST(append, wait, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 1, 64); + LOOP_RUN(1); + APPEND_SUBMIT(1, 1, 64); + APPEND_WAIT(0); + APPEND_WAIT(1); + return MUNIT_OK; +} + +/* Several batches with different sizes get appended at a fast pace, forcing + * the segment arena to grow. */ +TEST(append, resizeArena, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 2, 64); + APPEND_SUBMIT(1, 1, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(2, 2, 64); + APPEND_SUBMIT(3, 1, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(4, 1, SEGMENT_BLOCK_SIZE); + APPEND_WAIT(0); + APPEND_WAIT(1); + APPEND_WAIT(2); + APPEND_WAIT(3); + APPEND_WAIT(4); + ASSERT_ENTRIES(7, 64 * 4 + SEGMENT_BLOCK_SIZE * 3); + return MUNIT_OK; +} + +/* A few append requests get queued, then a truncate request comes in and other + * append requests right after, before truncation is fully completed. */ +TEST(append, truncate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int rv; + + return MUNIT_SKIP; /* FIXME: flaky */ + + APPEND(2, 64); + + APPEND_SUBMIT(0, 2, 64); + + rv = f->io.truncate(&f->io, 2); + munit_assert_int(rv, ==, 0); + + APPEND_SUBMIT(1, 2, 64); + + APPEND_WAIT(0); + APPEND_WAIT(1); + + return MUNIT_OK; +} + +/* A few append requests get queued, then a truncate request comes in and other + * append requests right after, before truncation is fully completed. However + * the backend is closed before the truncation request can be processed. */ +TEST(append, truncateClosing, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + int rv; + APPEND(2, 64); + APPEND_SUBMIT(0, 2, 64); + rv = f->io.truncate(&f->io, 2); + munit_assert_int(rv, ==, 0); + APPEND_SUBMIT(1, 2, 64); + APPEND_EXPECT(1, RAFT_CANCELED); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* A few append requests get queued, however the backend is closed before + * preparing the second segment completes. */ +TEST(append, prepareClosing, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 2, 64); + LOOP_RUN(1); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The counters of the open segments get increased as they are closed. */ +TEST(append, counter, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t size = SEGMENT_BLOCK_SIZE; + int i; + for (i = 0; i < 10; i++) { + APPEND(1, size); + } + munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000003")); + munit_assert_true(DirHasFile(f->dir, "0000000000000004-0000000000000006")); + munit_assert_true(DirHasFile(f->dir, "open-4")); + return MUNIT_OK; +} + +/* If the I/O instance is closed, all pending append requests get canceled. */ +TEST(append, cancel, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 1, 64); + APPEND_EXPECT(0, RAFT_CANCELED); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The creation of the current open segment fails because there's no space. 
*/ +TEST(append, noSpaceUponPrepareCurrent, setUp, tearDown, 0, DirTmpfsParams) +{ + struct fixture *f = data; + SKIP_IF_NO_FIXTURE; + raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE * 32768); + APPEND_FAILURE( + 1, 64, RAFT_NOSPACE, + "create segment open-1: not enough space to allocate 134217728 bytes"); + return MUNIT_OK; +} + +/* The creation of a spare open segment fails because there's no space. */ +TEST(append, noSpaceUponPrepareSpare, setUp, tearDown, 0, DirTmpfsParams) +{ + struct fixture *f = data; + SKIP_IF_NO_FIXTURE; +#if defined(__powerpc64__) + /* XXX: fails on ppc64el */ + return MUNIT_SKIP; +#endif + raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE * 2); + DirFill(f->dir, SEGMENT_BLOCK_SIZE * 3); + APPEND(1, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(0, 1, SEGMENT_BLOCK_SIZE); + APPEND_EXPECT(0, RAFT_NOSPACE); + APPEND_WAIT(0); + return MUNIT_OK; +} + +/* The write request fails because there's not enough space. */ +TEST(append, noSpaceUponWrite, setUp, tearDownDeps, 0, DirTmpfsParams) +{ + struct fixture *f = data; + SKIP_IF_NO_FIXTURE; +#if defined(__powerpc64__) + /* XXX: fails on ppc64el */ + TEAR_DOWN_UV; + return MUNIT_SKIP; +#endif + raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE); + DirFill(f->dir, SEGMENT_BLOCK_SIZE * 2); + APPEND(1, 64); + APPEND_FAILURE(1, (SEGMENT_BLOCK_SIZE + 128), RAFT_NOSPACE, + "short write: 4096 bytes instead of 8192"); + DirRemoveFile(f->dir, ".fill"); + LOOP_RUN(50); + APPEND(5, 64); + ASSERT_ENTRIES(6, 384); + return MUNIT_OK; +} + +/* A few requests fail because not enough disk space is available. Eventually + * the space is released and the request succeeds. */ +TEST(append, noSpaceResolved, setUp, tearDownDeps, 0, DirTmpfsParams) +{ + struct fixture *f = data; + SKIP_IF_NO_FIXTURE; +#if defined(__powerpc64__) + /* XXX: fails on ppc64el */ + TEAR_DOWN_UV; + return MUNIT_SKIP; +#endif + DirFill(f->dir, SEGMENT_BLOCK_SIZE); + APPEND_FAILURE( + 1, 64, RAFT_NOSPACE, + "create segment open-1: not enough space to allocate 16384 bytes"); + APPEND_FAILURE( + 1, 64, RAFT_NOSPACE, + "create segment open-2: not enough space to allocate 16384 bytes"); + DirRemoveFile(f->dir, ".fill"); + f->count = 0; /* Reset the data counter */ + APPEND(1, 64); + ASSERT_ENTRIES(1, 64); + return MUNIT_OK; +} + +/* An error occurs while performing a write. */ +TEST(append, writeError, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + aio_context_t ctx = 0; + + /* FIXME: doesn't fail anymore after + * https://github.com/CanonicalLtd/raft/pull/49 */ + return MUNIT_SKIP; + + APPEND_SUBMIT(0, 1, 64); + AioFill(&ctx, 0); + APPEND_WAIT(0); + AioDestroy(ctx); + return MUNIT_OK; +} + +static char *oomHeapFaultDelay[] = {"1", /* FIXME "2", */ NULL}; +static char *oomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum oomParams[] = { + {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory conditions. */ +TEST(append, oom, setUp, tearDown, 0, oomParams) +{ + struct fixture *f = data; + HEAP_FAULT_ENABLE; + APPEND_ERROR(1, 64, RAFT_NOMEM, ""); + return MUNIT_OK; +} + +/* The uv instance is closed while a write request is in progress. */ +TEST(append, closeDuringWrite, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + /* TODO: broken */ + return MUNIT_SKIP; + + APPEND_SUBMIT(0, 1, 64); + LOOP_RUN(1); + TEAR_DOWN_UV; + + return MUNIT_OK; +}
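+ +/* The sizes in the no-space error messages above are easy to verify: + * noSpaceUponPrepareCurrent configures a segment of SEGMENT_BLOCK_SIZE * + * 32768 = 4096 * 32768 = 134217728 bytes; noSpaceResolved leaves the + * fixture's SEGMENT_SIZE of 4 * 4096 = 16384 bytes in place; and the "4096 + * bytes instead of 8192" short write corresponds to one block persisted out + * of the two that the oversized batch needed. */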
+ +/* When the backend is closed, all unused open segments get removed. */ +TEST(append, removeSegmentUponClose, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + while (!DirHasFile(f->dir, "open-2")) { + LOOP_RUN(1); + } + TEAR_DOWN_UV; + munit_assert_false(DirHasFile(f->dir, "open-2")); + return MUNIT_OK; +} + +/* When the backend is closed, all pending prepare requests get canceled. */ +TEST(append, cancelPrepareRequest, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + /* TODO: find a way to test a prepare request cancelation */ + return MUNIT_SKIP; + APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(0, 1, 64); + APPEND_EXPECT(0, RAFT_CANCELED); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* When the backend gets closed it tells the writer to close the segment that + * it's currently writing. */ +TEST(append, currentSegment, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + + APPEND(1, 64); + + TEAR_DOWN_UV; + + munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000001")); + + return MUNIT_OK; +} + +/* The kernel has run out of available AIO events. */ +TEST(append, ioSetupError, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + aio_context_t ctx = 0; + int rv; + rv = AioFill(&ctx, 0); + if (rv != 0) { + return MUNIT_SKIP; + } + APPEND_FAILURE(1, 64, RAFT_TOOMANY, + "setup writer for open-1: AIO events user limit exceeded"); + return MUNIT_OK; +} + +/*=========================================================================== + Test interaction between UvAppend and UvBarrier + ===========================================================================*/ + +struct barrierData +{ + int current; /* Count the number of finished AppendEntries RPCs */ + int expected; /* Expected number of finished AppendEntries RPCs */ + bool done; /* @true if the Barrier CB has fired */ + bool expectDone; /* Expect the Barrier CB to have fired or not */ + char **files; /* Expected files in the directory, NULL terminated */ + struct uv *uv; +}; + +static void barrierCbCompareCounter(struct UvBarrierReq *barrier) +{ + struct barrierData *bd = barrier->data; + munit_assert_false(bd->done); + bd->done = true; + struct uv *uv = bd->uv; + UvUnblock(uv); + munit_assert_int(bd->current, ==, bd->expected); + if (bd->files != NULL) { + int i = 0; + while (bd->files[i] != NULL) { + munit_assert_true(DirHasFile(uv->dir, bd->files[i])); + ++i; + } + } +} + +static void barrierDoneCb(struct UvBarrierReq *barrier) +{ + struct barrierData *bd = barrier->data; + munit_assert_false(bd->done); + bd->done = true; +} + +static void appendCbIncreaseCounterAssertResult(struct raft_io_append *req, + int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; + struct barrierData *bd = result->data; + munit_assert_true(bd->done == bd->expectDone); + bd->current += 1; +} + +static void appendDummyCb(struct raft_io_append *req, int status) +{ + (void)req; + (void)status; +} + +static char *bools[] = {"0", "1", NULL}; +static MunitParameterEnum blocking_bool_params[] = { + {"bool", bools}, + {NULL, NULL}, +};
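+ +/* What the barrier tests below exercise, as far as these fixtures show: + * UvBarrier(uv, next_index, req) fires req->cb once the open segments below + * that index have been finalized, so a non-blocking barrier lets in-flight + * append writes finish first (expectDone is false in those tests), while a + * blocking barrier additionally holds back writes submitted after it until + * UvUnblock(uv) is called, which is why barrierCbCompareCounter unblocks + * explicitly. */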
+ +/* Fill up 3 segments' worth of AppendEntries RPCs. + * Request a Barrier and expect that the AppendEntries RPCs are finished before + * the Barrier callback is fired. + */ +TEST(append, barrierOpenSegments, setUp, tearDown, 0, blocking_bool_params) +{ + struct fixture *f = data; + struct barrierData bd = {0}; + bd.current = 0; + bd.expected = 3; + bd.done = false; + bd.expectDone = false; + bd.uv = f->io.impl; + char *files[] = {"0000000000000001-0000000000000004", + "0000000000000005-0000000000000008", + "0000000000000009-0000000000000012", NULL}; + bd.files = files; + + APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendCbIncreaseCounterAssertResult, &bd, 0); + APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendCbIncreaseCounterAssertResult, &bd, 0); + APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendCbIncreaseCounterAssertResult, &bd, 0); + + struct UvBarrierReq barrier = {0}; + barrier.data = (void *)&bd; + barrier.blocking = + (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); + barrier.cb = barrierCbCompareCounter; + UvBarrier(f->io.impl, 1, &barrier); + + /* Make sure every callback fired */ + LOOP_RUN_UNTIL(&bd.done); + APPEND_WAIT(0); + APPEND_WAIT(1); + APPEND_WAIT(2); + return MUNIT_OK; +} + +/* Fill up 3 segments' worth of AppendEntries RPCs. + * Request a Barrier and stop early. + */ +TEST(append, barrierOpenSegmentsExitEarly, setUp, NULL, 0, blocking_bool_params) +{ + struct fixture *f = data; + struct barrierData bd = {0}; + bd.current = 0; + bd.expected = 3; + bd.done = false; + bd.expectDone = false; + bd.uv = f->io.impl; + char *files[] = {"0000000000000001-0000000000000004", + "0000000000000005-0000000000000008", + "0000000000000009-0000000000000012", NULL}; + bd.files = files; + + APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendDummyCb, NULL, 0); + APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendDummyCb, NULL, 0); + APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendDummyCb, NULL, 0); + + struct UvBarrierReq barrier = {0}; + barrier.data = (void *)&bd; + barrier.blocking = + (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); + barrier.cb = barrierDoneCb; + UvBarrier(f->io.impl, 1, &barrier); + + /* Exit early. */ + tearDown(data); + munit_assert_true(bd.done); + + return MUNIT_OK; +} + +/* Fill up 3 segments' worth of AppendEntries RPCs. + * Request 2 barriers and expect their callbacks to fire. + */ +TEST(append, twoBarriersOpenSegments, setUp, tearDown, 0, blocking_bool_params) +{ + struct fixture *f = data; + struct barrierData bd1 = {0}; + bd1.current = 0; + bd1.expected = 3; + bd1.done = false; + bd1.expectDone = false; + bd1.uv = f->io.impl; + char *files[] = {"0000000000000001-0000000000000004", + "0000000000000005-0000000000000008", + "0000000000000009-0000000000000012", NULL}; + bd1.files = files; + /* Only expect the callback to eventually fire. 
+    struct barrierData bd2 = {0};
+    bd2.uv = f->io.impl;
+
+    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd1, 0);
+    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd1, 0);
+    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd1, 0);
+
+    struct UvBarrierReq barrier1 = {0};
+    barrier1.data = (void *)&bd1;
+    barrier1.blocking =
+        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
+    barrier1.cb = barrierCbCompareCounter;
+    UvBarrier(f->io.impl, 1, &barrier1);
+    struct UvBarrierReq barrier2 = {0};
+    barrier2.data = (void *)&bd2;
+    barrier2.blocking =
+        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
+    barrier2.cb = barrierCbCompareCounter;
+    UvBarrier(f->io.impl, 1, &barrier2);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd1.done);
+    LOOP_RUN_UNTIL(&bd2.done);
+    APPEND_WAIT(0);
+    APPEND_WAIT(1);
+    APPEND_WAIT(2);
+    return MUNIT_OK;
+}
+
+/* Fill up 3 segments worth of AppendEntries RPCs.
+ * Request 2 barriers and exit early. */
+TEST(append, twoBarriersExitEarly, setUp, NULL, 0, blocking_bool_params)
+{
+    struct fixture *f = data;
+    struct barrierData bd1 = {0};
+    bd1.current = 0;
+    bd1.expected = 3;
+    bd1.done = false;
+    bd1.expectDone = false;
+    bd1.uv = f->io.impl;
+    char *files[] = {"0000000000000001-0000000000000004",
+                     "0000000000000005-0000000000000008",
+                     "0000000000000009-0000000000000012", NULL};
+    bd1.files = files;
+    /* Only expect the callback to eventually fire. */
+    struct barrierData bd2 = {0};
+    bd2.uv = f->io.impl;
+
+    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendDummyCb, NULL, 0);
+    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendDummyCb, NULL, 0);
+    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendDummyCb, NULL, 0);
+
+    struct UvBarrierReq barrier1 = {0};
+    barrier1.data = (void *)&bd1;
+    barrier1.blocking =
+        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
+    barrier1.cb = barrierDoneCb;
+    UvBarrier(f->io.impl, 1, &barrier1);
+    struct UvBarrierReq barrier2 = {0};
+    barrier2.data = (void *)&bd2;
+    barrier2.blocking =
+        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
+    barrier2.cb = barrierDoneCb;
+    UvBarrier(f->io.impl, 1, &barrier2);
+
+    /* Exit early. */
+    tearDown(data);
+    munit_assert_true(bd1.done);
+    munit_assert_true(bd2.done);
+
+    return MUNIT_OK;
+}
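+
+/* The tests above request barriers after submitting appends; the tests below
+ * request the barrier first, showing that a blocking barrier holds back
+ * appends submitted after it until UvUnblock() is called. */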
+
+/* Request a blocking Barrier and expect that no AppendEntries RPCs are
+ * finished before the Barrier callback is fired. */
+TEST(append, blockingBarrierNoOpenSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct barrierData bd = {0};
+    bd.current = 0;
+    bd.expected = 0;
+    bd.done = false;
+    bd.expectDone = true;
+    bd.uv = f->io.impl;
+
+    struct UvBarrierReq barrier = {0};
+    barrier.data = (void *)&bd;
+    barrier.blocking = true;
+    barrier.cb = barrierCbCompareCounter;
+    UvBarrier(f->io.impl, 1, &barrier);
+
+    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd.done);
+    APPEND_WAIT(0);
+    APPEND_WAIT(1);
+    APPEND_WAIT(2);
+    return MUNIT_OK;
+}
+
+/* Request a blocking Barrier and expect that no AppendEntries RPCs are
+ * finished before the Barrier callback is fired. */
+TEST(append, blockingBarrierSingleOpenSegment, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct barrierData bd = {0};
+    bd.current = 0;
+    bd.expected = 0;
+    bd.done = false;
+    bd.expectDone = true;
+    bd.uv = f->io.impl;
+    char *files[] = {"0000000000000001-0000000000000001", NULL};
+    bd.files = files;
+
+    /* Wait until there is at least 1 open segment, otherwise the barrier Cb
+     * is fired immediately. */
+    APPEND(1, 64);
+    while (!DirHasFile(f->dir, "open-1")) {
+        LOOP_RUN(1);
+    }
+
+    struct UvBarrierReq barrier = {0};
+    barrier.data = (void *)&bd;
+    barrier.blocking = true;
+    barrier.cb = barrierCbCompareCounter;
+    UvBarrier(f->io.impl, 1, &barrier);
+
+    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd.done);
+    APPEND_WAIT(0);
+    APPEND_WAIT(1);
+    APPEND_WAIT(2);
+    return MUNIT_OK;
+}
+
+static void longWorkCb(uv_work_t *work)
+{
+    (void)work;
+    sleep(1);
+}
+
+static void longAfterWorkCb(uv_work_t *work, int status)
+{
+    struct barrierData *bd = work->data;
+    munit_assert_false(bd->done);
+    bd->done = true;
+    munit_assert_int(status, ==, 0);
+    struct uv *uv = bd->uv;
+    UvUnblock(uv);
+    munit_assert_int(bd->current, ==, bd->expected);
+    free(work);
+}
+
+static void barrierCbLongWork(struct UvBarrierReq *barrier)
+{
+    struct barrierData *bd = barrier->data;
+    munit_assert_false(bd->done);
+    struct uv *uv = bd->uv;
+    int rv;
+
+    uv_work_t *work = munit_malloc(sizeof(*work));
+    munit_assert_ptr_not_null(work);
+    work->data = bd;
+
+    rv = uv_queue_work(uv->loop, work, longWorkCb, longAfterWorkCb);
+    munit_assert_int(rv, ==, 0);
+}
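+
+/* barrierCbLongWork hands the sleep off to the libuv threadpool via
+ * uv_queue_work(), so the event loop keeps running while the simulated
+ * snapshot work is in flight; longAfterWorkCb then runs back on the loop
+ * thread and is what actually unblocks the barrier. */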
+
+/* Request a non-blocking Barrier that triggers a long-running task; the
+ * barrier is removed when the long-running task completes. This simulates a
+ * large snapshot write. Ensure Append requests complete before the
+ * long-running task completes. */
+TEST(append, nonBlockingBarrierLongBlockingTask, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct barrierData bd = {0};
+    bd.current = 0;
+    bd.expected = 1;
+    bd.done = false;
+    bd.expectDone = false;
+    bd.uv = f->io.impl;
+
+    struct UvBarrierReq barrier = {0};
+    barrier.data = (void *)&bd;
+    barrier.blocking = false;
+    barrier.cb = barrierCbLongWork;
+    UvBarrier(f->io.impl, bd.uv->append_next_index, &barrier);
+    APPEND_SUBMIT_CB_DATA(0, 1, 64, appendCbIncreaseCounterAssertResult, &bd,
+                          0);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd.done);
+    APPEND_WAIT(0);
+    return MUNIT_OK;
+}
+
+/* Request a blocking Barrier that triggers a long-running task; the barrier
+ * is unblocked and removed when the long-running task completes. This
+ * simulates a large snapshot install. Ensure Append requests complete after
+ * the work completes. */
+TEST(append, blockingBarrierLongBlockingTask, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct barrierData bd = {0};
+    bd.current = 0;
+    bd.expected = 0;
+    bd.done = false;
+    bd.expectDone = true;
+    bd.uv = f->io.impl;
+
+    struct UvBarrierReq barrier = {0};
+    barrier.data = (void *)&bd;
+    barrier.blocking = true;
+    barrier.cb = barrierCbLongWork;
+    UvBarrier(f->io.impl, bd.uv->append_next_index, &barrier);
+    APPEND_SUBMIT_CB_DATA(0, 1, 64, appendCbIncreaseCounterAssertResult, &bd,
+                          0);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd.done);
+    APPEND_WAIT(0);
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_bootstrap.c b/test/raft/integration/test_uv_bootstrap.c
new file mode 100644
index 000000000..e987f15cb
--- /dev/null
+++ b/test/raft/integration/test_uv_bootstrap.c
@@ -0,0 +1,98 @@
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+/******************************************************************************
+ *
+ * Fixture with a libuv-based raft_io instance and an empty configuration.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+    struct raft_configuration conf;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+/* Add a server to the fixture's configuration. */
+#define CONFIGURATION_ADD(ID, ADDRESS) \
+    { \
+        int rv_; \
+        rv_ = raft_configuration_add(&f->conf, ID, ADDRESS, RAFT_VOTER); \
+        munit_assert_int(rv_, ==, 0); \
+    }
+
+/* Invoke f->io.bootstrap() and assert that no error occurs. */
+#define BOOTSTRAP \
+    { \
+        int rv_; \
+        rv_ = f->io.bootstrap(&f->io, &f->conf); \
+        munit_assert_int(rv_, ==, 0); \
+    }
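+
+/* A successful bootstrap persists the initial configuration and writes
+ * metadata with term 1, so bootstrapping the same data directory twice must
+ * fail (see the termIsNonZero test below). */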
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    SETUP_UV;
+    raft_configuration_init(&f->conf);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    raft_configuration_close(&f->conf);
+    TEAR_DOWN_UV;
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * raft_io->bootstrap()
+ *
+ *****************************************************************************/
+
+SUITE(bootstrap)
+
+/* Invoke f->io.bootstrap() and assert that it returns the given error code
+ * and message. */
+#define BOOTSTRAP_ERROR(RV, ERRMSG) \
+    { \
+        int rv_; \
+        rv_ = f->io.bootstrap(&f->io, &f->conf); \
+        munit_assert_int(rv_, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+    }
+
+/* Bootstrap a pristine server. */
+TEST(bootstrap, pristine, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CONFIGURATION_ADD(1, "1");
+    BOOTSTRAP;
+    return MUNIT_OK;
+}
+
+/* The data directory already has metadata files with a non-zero term. */
+TEST(bootstrap, termIsNonZero, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CONFIGURATION_ADD(1, "1");
+    BOOTSTRAP;
+    BOOTSTRAP_ERROR(RAFT_CANTBOOTSTRAP, "metadata contains term 1");
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_init.c b/test/raft/integration/test_uv_init.c
new file mode 100644
index 000000000..d4358c689
--- /dev/null
+++ b/test/raft/integration/test_uv_init.c
@@ -0,0 +1,268 @@
+#include "../../../src/raft.h"
+#include "../../../src/raft/byte.h"
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+#include <linux/magic.h> /* TMPFS_MAGIC */
+#include <sys/vfs.h>     /* statfs() */
+
+/******************************************************************************
+ *
+ * Fixture with a non-initialized raft_io instance and uv dependencies.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+    bool closed;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+static void closeCb(struct raft_io *io)
+{
+    struct fixture *f = io->data;
+    f->closed = true;
+}
+
+/* Invoke raft_uv_init() and assert that no error occurs. */
+#define INIT(DIR) \
+    do { \
+        int _rv; \
+        _rv = raft_uv_init(&f->io, &f->loop, DIR, &f->transport); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = f->io.init(&f->io, 1, "1"); \
+        munit_assert_int(_rv, ==, 0); \
+    } while (0)
+
+/* Invoke raft_io->close(). */
+#define CLOSE \
+    do { \
+        f->io.close(&f->io, closeCb); \
+        LOOP_RUN_UNTIL(&f->closed); \
+        raft_uv_close(&f->io); \
+    } while (0)
+
+/* Invoke raft_uv_init() and assert that the given error code is returned and
+ * the given error message set. */
+#define INIT_ERROR(DIR, RV, ERRMSG) \
+    do { \
+        int _rv; \
+        _rv = raft_uv_init(&f->io, &f->loop, DIR, &f->transport); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = f->io.init(&f->io, 1, "1"); \
+        munit_assert_int(_rv, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+        CLOSE; \
+    } while (0)
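+
+/* On-disk metadata layout written by the helper below: four 8-byte
+ * little-endian words, in order the format version, the metadata version
+ * (used to pick the newer of metadata1 and metadata2), the term, and the
+ * voted-for server ID (0 if none). */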
+
+/* Write either the metadata1 or metadata2 file, filling it with the given
+ * values. */
+#define WRITE_METADATA_FILE(N, FORMAT, VERSION, TERM, VOTED_FOR) \
+    { \
+        uint8_t buf[8 * 4]; \
+        void *cursor = buf; \
+        char filename[strlen("metadataN") + 1]; \
+        sprintf(filename, "metadata%d", N); \
+        bytePut64(&cursor, FORMAT); \
+        bytePut64(&cursor, VERSION); \
+        bytePut64(&cursor, TERM); \
+        bytePut64(&cursor, VOTED_FOR); \
+        DirWriteFile(f->dir, filename, buf, sizeof buf); \
+    }
+
+#define LONG_DIR \
+    "/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" \
+    "/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" \
+    "/ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" \
+    "/ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" \
+    "/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee" \
+    "/fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" \
+    "/ggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg" \
+    "/hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh" \
+    "/iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii" \
+    "/jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj" \
+    "/kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk" \
+    "/lllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllll" \
+    "/mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm"
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    f->io.data = f;
+    f->closed = false;
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    if (f == NULL) {
+        return;
+    }
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * raft_io->init()
+ *
+ *****************************************************************************/
+
+SUITE(init)
+
+TEST(init, dirTooLong, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_io io = {0};
+    int rv;
+    rv = raft_uv_init(&io, &f->loop, LONG_DIR, &f->transport);
+    munit_assert_int(rv, ==, RAFT_NAMETOOLONG);
+    munit_assert_string_equal(io.errmsg, "directory path too long");
+    return 0;
+}
+
+/* Out of memory conditions upon probing for direct I/O. */
+TEST(init, probeDirectIoOom, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    /* XXX: tmpfs seems to not support O_DIRECT */
+    struct statfs info;
+    int rv;
+    rv = statfs(f->dir, &info);
+    munit_assert_int(rv, ==, 0);
+    if (info.f_type == TMPFS_MAGIC) {
+        return MUNIT_SKIP;
+    }
+#if defined(__powerpc64__)
+    /* XXX: fails on ppc64el */
+    return MUNIT_SKIP;
+#endif
+    HeapFaultConfig(&f->heap, 1 /* delay */, 1 /* repeat */);
+    HEAP_FAULT_ENABLE;
+    INIT_ERROR(f->dir, RAFT_NOMEM, "probe Direct I/O: out of memory");
+    return 0;
+}
+
+/* Out of memory conditions upon probing for async I/O. */
+TEST(init, probeAsyncIoOom, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    /* XXX: tmpfs seems to not support O_DIRECT */
+    struct statfs info;
+    int rv;
+    rv = statfs(f->dir, &info);
+    munit_assert_int(rv, ==, 0);
+    if (info.f_type == TMPFS_MAGIC) {
+        return MUNIT_SKIP;
+    }
+#if defined(__powerpc64__)
+    /* XXX: fails on ppc64el */
+    return MUNIT_SKIP;
+#endif
+    HeapFaultConfig(&f->heap, 2 /* delay */, 1 /* repeat */);
+    HEAP_FAULT_ENABLE;
+    INIT_ERROR(f->dir, RAFT_NOMEM, "probe Async I/O: out of memory");
+    return 0;
+}
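+
+/* In the two OOM tests above, the heap fault injector's "delay" argument
+ * selects which allocation fails: delay 1 makes the first allocation
+ * performed during init fail (the Direct I/O probe), delay 2 the second one
+ * (the Async I/O probe). */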
+
+/* The given directory does not exist. */
+TEST(init, dirDoesNotExist, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    INIT_ERROR("/foo/bar/egg/baz", RAFT_NOTFOUND,
+               "directory '/foo/bar/egg/baz' does not exist");
+    return MUNIT_OK;
+}
+
+/* The given directory is not accessible. */
+TEST(init, dirNotAccessible, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    char errmsg[RAFT_ERRMSG_BUF_SIZE];
+    sprintf(errmsg, "directory '%s' is not writable", f->dir);
+    DirMakeUnexecutable(f->dir);
+    INIT_ERROR(f->dir, RAFT_INVALID, errmsg);
+    return MUNIT_OK;
+}
+
+/* No space is left for probing I/O capabilities. */
+TEST(init, noSpace, setUp, tearDown, 0, DirTmpfsParams)
+{
+    struct fixture *f = data;
+    SKIP_IF_NO_FIXTURE;
+    DirFill(f->dir, 4);
+    INIT_ERROR(f->dir, RAFT_NOSPACE,
+               "create I/O capabilities probe file: not enough space to "
+               "allocate 4096 bytes");
+    return MUNIT_OK;
+}
+
+/* The metadata1 file does not have the expected number of bytes. In this case
+ * the file is not considered at all, and the effect is as if this was a brand
+ * new server. */
+TEST(init, metadataOneTooShort, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uint8_t buf[16] = {0};
+    DirWriteFile(f->dir, "metadata1", buf, sizeof buf);
+    INIT(f->dir);
+    CLOSE;
+    return MUNIT_OK;
+}
+
+/* The metadata1 file does not have the expected format. */
+TEST(init, metadataOneBadFormat, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    WRITE_METADATA_FILE(1, /* Metadata file index */
+                        2, /* Format */
+                        1, /* Version */
+                        1, /* Term */
+                        0 /* Voted for */);
+    INIT_ERROR(f->dir, RAFT_MALFORMED,
+               "decode content of metadata1: bad format version 2");
+    return MUNIT_OK;
+}
+
+/* The metadata1 file does not have a valid version. */
+TEST(init, metadataOneBadVersion, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    WRITE_METADATA_FILE(1, /* Metadata file index */
+                        1, /* Format */
+                        0, /* Version */
+                        1, /* Term */
+                        0 /* Voted for */);
+    INIT_ERROR(f->dir, RAFT_CORRUPT,
+               "decode content of metadata1: version is set to zero");
+    return MUNIT_OK;
+}
+
+/* The data directory has both metadata files, but they have the same
+ * version. */
+TEST(init, metadataOneAndTwoSameVersion, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    WRITE_METADATA_FILE(1, /* Metadata file index */
+                        1, /* Format */
+                        2, /* Version */
+                        3, /* Term */
+                        0 /* Voted for */);
+    WRITE_METADATA_FILE(2, /* Metadata file index */
+                        1, /* Format */
+                        2, /* Version */
+                        2, /* Term */
+                        0 /* Voted for */);
+    INIT_ERROR(f->dir, RAFT_CORRUPT,
+               "metadata1 and metadata2 are both at version 2");
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_load.c b/test/raft/integration/test_uv_load.c
new file mode 100644
index 000000000..5ef1e339a
--- /dev/null
+++ b/test/raft/integration/test_uv_load.c
@@ -0,0 +1,1772 @@
+#include <stdio.h> /* sprintf() */
+
+#include "../../../src/raft/byte.h"
+#include "../../../src/raft/uv.h"
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+/******************************************************************************
+ *
+ * Fixture with a non-initialized libuv-based raft_io instance.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+static void closeCb(struct raft_io *io)
+{
+    bool *done = io->data;
+    *done = true;
+}
+
+static void appendCb(struct raft_io_append *req, int status)
+{
+    bool *done = req->data;
+    munit_assert_int(status, ==, 0);
+    *done = true;
+}
+
+static void snapshotPutCb(struct raft_io_snapshot_put *req, int status)
+{
+    bool *done = req->data;
+    munit_assert_int(status, ==, 0);
+    *done = true;
+}
+
+struct snapshot
+{
+    raft_term term;
+    raft_index index;
+    uint64_t data;
+};
+
+#define WORD_SIZE 8
+
+/* Maximum number of blocks a segment can have */
+#define MAX_SEGMENT_BLOCKS 4
+
+/* This block size should work fine for all file systems. */
+#define SEGMENT_BLOCK_SIZE 4096
+
+/* Desired segment size */
+#define SEGMENT_SIZE (SEGMENT_BLOCK_SIZE * MAX_SEGMENT_BLOCKS)
+
+#define CLOSED_SEGMENT_FILENAME(START, END) \
+    "000000000000000" #START \
+    "-" \
+    "000000000000000" #END
+
+/* Check if open segment file exists. */
+#define HAS_OPEN_SEGMENT_FILE(COUNT) DirHasFile(f->dir, "open-" #COUNT)
+
+/* Check if closed segment file exists. */
+#define HAS_CLOSED_SEGMENT_FILE(START, END) \
+    DirHasFile(f->dir, CLOSED_SEGMENT_FILENAME(START, END))
+
+/* Initialize a standalone raft_io instance and use it to append N batches of
+ * entries, each containing one entry. DATA should be an integer that will be
+ * used as base value for the data of the first entry, and will then be
+ * incremented for subsequent entries. */
+#define APPEND(N, DATA) \
+    do { \
+        struct raft_uv_transport _transport; \
+        struct raft_io _io; \
+        raft_term _term; \
+        raft_id _voted_for; \
+        struct raft_snapshot *_snapshot; \
+        raft_index _start_index; \
+        struct raft_entry *_entries; \
+        size_t _i; \
+        size_t _n; \
+        void *_batch = NULL; \
+        struct raft_entry _new_entry; \
+        uint64_t _new_entry_data; \
+        uint64_t _data = DATA; \
+        struct raft_io_append _req; \
+        bool _done = false; \
+        int _rv; \
+ \
+        /* Initialize the instance, loading existing data, but discarding \
+         * it. This makes sure that the start index is correctly set. */ \
+        _transport.version = 1; \
+        _rv = raft_uv_tcp_init(&_transport, &f->loop); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = raft_uv_init(&_io, &f->loop, f->dir, &_transport); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = _io.init(&_io, 1, "1"); \
+        munit_assert_int(_rv, ==, 0); \
+        raft_uv_set_block_size(&_io, SEGMENT_BLOCK_SIZE); \
+        raft_uv_set_segment_size(&_io, SEGMENT_SIZE); \
+        _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \
+                       &_entries, &_n); \
+        munit_assert_int(_rv, ==, 0); \
+        for (_i = 0; _i < _n; _i++) { \
+            struct raft_entry *_entry = &_entries[_i]; \
+            if (_entry->batch != _batch) { \
+                _batch = _entry->batch; \
+                raft_free(_batch); \
+            } \
+        } \
+        if (_entries != NULL) { \
+            raft_free(_entries); \
+        } \
+        if (_snapshot != NULL) { \
+            raft_configuration_close(&_snapshot->configuration); \
+            munit_assert_int(_snapshot->n_bufs, ==, 1); \
+            raft_free(_snapshot->bufs[0].base); \
+            raft_free(_snapshot->bufs); \
+            raft_free(_snapshot); \
+        } \
+ \
+        /* Append the new entries. */ \
+        for (_i = 0; _i < N; _i++) { \
+            struct raft_entry *entry = &_new_entry; \
+            entry->term = 1; \
+            entry->type = RAFT_COMMAND; \
+            entry->buf.base = &_new_entry_data; \
+            entry->buf.len = sizeof _new_entry_data; \
+            entry->batch = NULL; \
+            munit_assert_ptr_not_null(entry->buf.base); \
+            memset(entry->buf.base, 0, entry->buf.len); \
+            *(uint64_t *)entry->buf.base = _data; \
+            _data++; \
+            _req.data = &_done; \
+            _rv = _io.append(&_io, &_req, entry, 1, appendCb); \
+            munit_assert_int(_rv, ==, 0); \
+            LOOP_RUN_UNTIL(&_done); \
+            _done = false; \
+        } \
+ \
+        /* Shutdown the standalone raft_io instance. */ \
+        _done = false; \
+        _io.data = &_done; \
+        _io.close(&_io, closeCb); \
+        LOOP_RUN_UNTIL(&_done); \
+        raft_uv_close(&_io); \
+        raft_uv_tcp_close(&_transport); \
+    } while (0);
+
+/* Initialize a standalone raft_io instance and use it to persist a new
+ * snapshot at the given INDEX and TERM. DATA should be an integer that will
+ * be used as the snapshot content. */
+#define SNAPSHOT_PUT(TERM, INDEX, DATA) \
+    do { \
+        struct raft_uv_transport _transport; \
+        struct raft_io _io; \
+        raft_term _term; \
+        raft_id _voted_for; \
+        struct raft_snapshot *_snapshot; \
+        raft_index _start_index; \
+        struct raft_entry *_entries; \
+        size_t _i; \
+        size_t _n; \
+        void *_batch = NULL; \
+        struct raft_snapshot _new_snapshot; \
+        struct raft_buffer _new_snapshot_buf; \
+        uint64_t _new_snapshot_data = DATA; \
+        struct raft_io_snapshot_put _req; \
+        bool _done = false; \
+        int _rv; \
+ \
+        /* Initialize the instance, loading existing data, but discarding \
+         * it. This makes sure that the start index is correctly set. */ \
+        _transport.version = 1; \
+        _rv = raft_uv_tcp_init(&_transport, &f->loop); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = raft_uv_init(&_io, &f->loop, f->dir, &_transport); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = _io.init(&_io, 1, "1"); \
+        munit_assert_int(_rv, ==, 0); \
+        raft_uv_set_block_size(&_io, SEGMENT_BLOCK_SIZE); \
+        raft_uv_set_segment_size(&_io, SEGMENT_SIZE); \
+        _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \
+                       &_entries, &_n); \
+        munit_assert_int(_rv, ==, 0); \
+        for (_i = 0; _i < _n; _i++) { \
+            struct raft_entry *_entry = &_entries[_i]; \
+            if (_entry->batch != _batch) { \
+                _batch = _entry->batch; \
+                raft_free(_batch); \
+            } \
+        } \
+        if (_entries != NULL) { \
+            raft_free(_entries); \
+        } \
+        if (_snapshot != NULL) { \
+            raft_configuration_close(&_snapshot->configuration); \
+            munit_assert_int(_snapshot->n_bufs, ==, 1); \
+            raft_free(_snapshot->bufs[0].base); \
+            raft_free(_snapshot->bufs); \
+            raft_free(_snapshot); \
+        } \
+ \
+        /* Persist the new snapshot. */ \
+        _new_snapshot.index = INDEX; \
+        _new_snapshot.term = TERM; \
+        raft_configuration_init(&_new_snapshot.configuration); \
+        _rv = raft_configuration_add(&_new_snapshot.configuration, 1, "1", \
+                                     RAFT_VOTER); \
+        munit_assert_int(_rv, ==, 0); \
+        _new_snapshot.bufs = &_new_snapshot_buf; \
+        _new_snapshot.n_bufs = 1; \
+        _new_snapshot_buf.base = &_new_snapshot_data; \
+        _new_snapshot_buf.len = sizeof _new_snapshot_data; \
+        _req.data = &_done; \
+        _rv = \
+            _io.snapshot_put(&_io, 10, &_req, &_new_snapshot, snapshotPutCb); \
+        munit_assert_int(_rv, ==, 0); \
+        LOOP_RUN_UNTIL(&_done); \
+        raft_configuration_close(&_new_snapshot.configuration); \
+ \
+        /* Shutdown the standalone raft_io instance. */ \
+        _done = false; \
+        _io.data = &_done; \
+        _io.close(&_io, closeCb); \
+        LOOP_RUN_UNTIL(&_done); \
+        raft_uv_close(&_io); \
+        raft_uv_tcp_close(&_transport); \
+    } while (0);
+
+/* Forcibly turn a closed segment into an open one, by renaming the underlying
+ * file and growing its size. */
+#define UNFINALIZE(FIRST_INDEX, LAST_INDEX, COUNTER) \
+    do { \
+        const char *_filename1 = \
+            CLOSED_SEGMENT_FILENAME(FIRST_INDEX, LAST_INDEX); \
+        char _filename2[64]; \
+        sprintf(_filename2, "open-%u", (unsigned)COUNTER); \
+        munit_assert_true(DirHasFile(f->dir, _filename1)); \
+        munit_assert_false(DirHasFile(f->dir, _filename2)); \
+        DirRenameFile(f->dir, _filename1, _filename2); \
+        DirGrowFile(f->dir, _filename2, SEGMENT_SIZE); \
+    } while (0)
+
+#define LOAD_VARS \
+    int _rv; \
+    raft_term _term; \
+    raft_id _voted_for; \
+    struct raft_snapshot *_snapshot; \
+    raft_index _start_index; \
+    struct raft_entry *_entries; \
+    size_t _n;
+
+/* Initialize the raft_io instance, then call raft_io->load() and assert that
+ * it returns the given error code and message. */
+#define LOAD_ERROR(RV, ERRMSG) \
+    do { \
+        LOAD_VARS; \
+        SETUP_UV; \
+        _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \
+                         &_start_index, &_entries, &_n); \
+        munit_assert_int(_rv, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+    } while (0)
+
+#define LOAD_ERROR_NO_SETUP(RV, ERRMSG) \
+    do { \
+        LOAD_VARS; \
+        _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \
+                         &_start_index, &_entries, &_n); \
+        munit_assert_int(_rv, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+    } while (0)
+
+#define LOAD_ERROR_NO_RECOVER(RV, ERRMSG) \
+    do { \
+        LOAD_VARS; \
+        SETUP_UV; \
+        _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \
+                         &_start_index, &_entries, &_n); \
+        munit_assert_int(_rv, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+    } while (0)
+
+#define _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
+    _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, &_start_index, \
+                     &_entries, &_n); \
+    munit_assert_int(_rv, ==, 0); \
+    munit_assert_int(_term, ==, TERM); \
+    munit_assert_int(_voted_for, ==, VOTED_FOR); \
+    munit_assert_int(_start_index, ==, START_INDEX); \
+    if (_snapshot != NULL) { \
+        struct snapshot *_expected = (struct snapshot *)(SNAPSHOT); \
+        munit_assert_ptr_not_null(_snapshot); \
+        munit_assert_int(_snapshot->term, ==, _expected->term); \
+        munit_assert_int(_snapshot->index, ==, _expected->index); \
+        munit_assert_int(_snapshot->n_bufs, ==, 1); \
+        munit_assert_int(*(uint64_t *)_snapshot->bufs[0].base, ==, \
+                         _expected->data); \
+        raft_configuration_close(&_snapshot->configuration); \
+        raft_free(_snapshot->bufs[0].base); \
+        raft_free(_snapshot->bufs); \
+        raft_free(_snapshot); \
+    } \
+    if (_n != 0) { \
+        munit_assert_int(_n, ==, N_ENTRIES); \
+        for (_i = 0; _i < _n; _i++) { \
+            struct raft_entry *_entry = &_entries[_i]; \
+            uint64_t _value = *(uint64_t *)_entry->buf.base; \
+            munit_assert_int(_value, ==, _data); \
+            _data++; \
+        } \
+        for (_i = 0; _i < _n; _i++) { \
+            struct raft_entry *_entry = &_entries[_i]; \
+            if (_entry->batch != _batch) { \
+                _batch = _entry->batch; \
+                raft_free(_batch); \
+            } \
+        } \
+        raft_free(_entries); \
+    }
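+
+/* Note that _LOAD checks N_ENTRIES and the entry data only when at least one
+ * entry was actually loaded, so an expectation of 0 entries is satisfied
+ * vacuously. */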
+
+/* Initialize the raft_io instance, then invoke raft_io->load() and assert
+ * that it returns the given state. If non-NULL, SNAPSHOT points to a struct
+ * snapshot object whose attributes must match the loaded snapshot.
+ * ENTRIES_DATA is supposed to be the integer stored in the data of the first
+ * loaded entry. */
+#define LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, ENTRIES_DATA, N_ENTRIES) \
+    do { \
+        LOAD_VARS; \
+        void *_batch = NULL; \
+        uint64_t _data = ENTRIES_DATA; \
+        unsigned _i; \
+        SETUP_UV; \
+        _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
+    } while (0)
+
+/* Same as LOAD but with auto recovery turned on. */
+#define LOAD_WITH_AUTO_RECOVERY(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, \
+                                ENTRIES_DATA, N_ENTRIES) \
+    do { \
+        LOAD_VARS; \
+        void *_batch = NULL; \
+        uint64_t _data = ENTRIES_DATA; \
+        unsigned _i; \
+        SETUP_UV; \
+        raft_uv_set_auto_recovery(&f->io, true); \
+        _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
+    } while (0)
+
+/* Same as LOAD without SETUP_UV. */
+#define LOAD_NO_SETUP(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, ENTRIES_DATA, \
+                      N_ENTRIES) \
+    do { \
+        LOAD_VARS; \
+        void *_batch = NULL; \
+        uint64_t _data = ENTRIES_DATA; \
+        unsigned _i; \
+        _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_UV;
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * raft_io->load()
+ *
+ *****************************************************************************/
+
+SUITE(load)
+
+/* Load the initial state of a pristine server. */
+TEST(load, emptyDir, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+static char *unknownFiles[] = {
+    "garbage",
+    "0000000000000000000000000001-00000000001garbage",
+    "open-1garbage",
+    NULL,
+};
+
+static MunitParameterEnum unknownFilesParams[] = {
+    {"filename", unknownFiles},
+    {NULL, NULL},
+};
+
+/* Files that are not part of the raft state are ignored. */
+TEST(load, ignoreUnknownFiles, setUp, tearDown, 0, unknownFilesParams)
+{
+    struct fixture *f = data;
+    const char *filename = munit_parameters_get(params, "filename");
+    DirWriteFileWithZeros(f->dir, filename, 128);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+static char *unusableFiles[] = {"tmp-0000000001221212-0000000001221217",
+                                "tmp-snapshot-15-8260687-512469866",
+                                "snapshot-525-43326736-880259052",
+                                "snapshot-999-13371337-880259052.meta",
+                                "snapshot-20-8260687-512469866",
+                                "snapshot-88-8260687-512469866.meta",
+                                "snapshot-88-8260999-512469866.meta",
+                                "tmp-snapshot-88-8260999-512469866.meta",
+                                "tmp-snapshot-33-8260687-512469866",
+                                "snapshot-33-8260687-512469866.meta",
+                                "tmp-metadata1",
+                                "tmp-metadata2",
+                                "tmp-open1",
+                                "tmp-open13",
+                                NULL};
+
+static MunitParameterEnum unusableFilesParams[] = {
+    {"filename", unusableFiles},
+    {NULL, NULL},
+};
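+
+/* The file lists above probe the store's fixed filename patterns:
+ * "open-<counter>" for open segments, "<first>-<last>" for closed segments,
+ * and "snapshot-<term>-<index>-<timestamp>" plus a ".meta" companion for
+ * snapshots. Files that merely resemble them (tmp- prefixes, trailing
+ * garbage) are either ignored or deleted, as the surrounding tests verify. */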
+
+/* Files that can no longer be used are removed. */
+TEST(load, removeUnusableFiles, setUp, tearDown, 0, unusableFilesParams)
+{
+    struct fixture *f = data;
+    const char *filename = munit_parameters_get(params, "filename");
+    DirWriteFileWithZeros(f->dir, filename, 128);
+    munit_assert_true(DirHasFile(f->dir, filename));
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    munit_assert_false(DirHasFile(f->dir, filename));
+    return MUNIT_OK;
+}
+
+/* The data directory has an empty open segment. */
+TEST(load, emptyOpenSegment, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    DirWriteFile(f->dir, "open-1", NULL, 0);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    /* The empty segment has been removed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    return MUNIT_OK;
+}
+
+/* The data directory has a freshly allocated open segment filled with
+ * zeros. */
+TEST(load, openSegmentWithTrailingZeros, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    DirWriteFileWithZeros(f->dir, "open-1", 256);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    /* The empty segment has been removed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    return MUNIT_OK;
+}
+
+/* The data directory has valid closed and open segments. */
+TEST(load, bothOpenAndClosedSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(2, 1);
+    APPEND(1, 3);
+    APPEND(1, 4);
+    UNFINALIZE(4, 4, 1);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         4     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory has an allocated open segment which contains non-zero
+ * corrupted data in its second batch. */
+TEST(load, openSegmentWithNonZeroData, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uint64_t corrupt = 123456789;
+    APPEND(2, 1);
+    UNFINALIZE(1, 2, 1);
+    DirOverwriteFile(f->dir, "open-1", &corrupt, sizeof corrupt, 60);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         1     /* n entries */
+    );
+
+    /* The segment has been removed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+
+    return MUNIT_OK;
+}
+
+/* The data directory has an open segment with a partially written batch that
+ * needs to be truncated. */
+TEST(load, openSegmentWithIncompleteBatch, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uint8_t zero[256];
+    APPEND(2, 1);
+    UNFINALIZE(1, 2, 1);
+    memset(zero, 0, sizeof zero);
+    DirOverwriteFile(f->dir, "open-1", &zero, sizeof zero, 62);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         1     /* n entries */
+    );
+    return MUNIT_OK;
+}
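+
+/* Many of the tests below fabricate open segments by appending normally and
+ * then "unfinalizing" the resulting closed segment: UNFINALIZE renames it
+ * back to open-<counter> and grows it to SEGMENT_SIZE, mimicking a segment
+ * that was still being written when the server stopped. */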
+
+/* The data directory has an open segment whose first batch is only
+ * partially written. In that case the segment gets removed. */
+TEST(load, openSegmentWithIncompleteFirstBatch, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uint8_t buf[4 * WORD_SIZE] = {
+        1, 0, 0, 0, 0, 0, 0, 0, /* Format version */
+        0, 0, 0, 0, 0, 0, 0, 0, /* CRC32 checksums */
+        0, 0, 0, 0, 0, 0, 0, 0, /* Number of entries */
+        0, 0, 0, 0, 0, 0, 0, 0  /* Batch data */
+    };
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+
+    DirOverwriteFile(f->dir, "open-1", buf, sizeof buf, 0);
+
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+
+    return MUNIT_OK;
+}
+
+/* The data directory has two open segments, each containing one entry. */
+TEST(load, twoOpenSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    APPEND(1, 2);
+    UNFINALIZE(1, 1, 1);
+    UNFINALIZE(2, 2, 2);
+
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         2     /* n entries */
+    );
+
+    /* The first and second segments have been renamed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
+    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
+    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(2, 2));
+
+    return MUNIT_OK;
+}
+
+/* The data directory has two open segments, with the second one filled with
+ * zeros. */
+TEST(load, secondOpenSegmentIsAllZeros, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
+
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         1     /* n entries */
+    );
+
+    /* The first segment has been renamed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
+
+    /* The second segment has been removed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
+
+    return MUNIT_OK;
+}
+
+/* The data directory has two open segments, the first one has a corrupt
+ * header and auto-recovery is on. */
+TEST(load, twoOpenSegmentsFirstCorruptAutoRecovery, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    /* Load is successful and equals pristine condition. */
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            0,    /* data for first loaded entry */
+                            0     /* n entries */
+    );
+
+    /* The open segments are renamed, and there is no closed segment. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
+    munit_assert_false(HAS_CLOSED_SEGMENT_FILE(1, 1));
+
+    return MUNIT_OK;
+}
+
+/* The data directory has two open segments, the first one has a corrupt
+ * header. */
+TEST(load, twoOpenSegmentsFirstCorrupt, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+
+    /* Without auto-recovery the load fails and the open segments are left in
+     * place. */
+    munit_assert_true(HAS_OPEN_SEGMENT_FILE(1));
+    munit_assert_true(HAS_OPEN_SEGMENT_FILE(2));
+    return MUNIT_OK;
+}
+
+/* The data directory has a valid open segment. */
+TEST(load, openSegment, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         1     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* There is exactly one snapshot and no segments. */
+TEST(load, onlyOneSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        1, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 1, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         2,         /* start index */
+         0,         /* data for first loaded entry */
+         0          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* There are several snapshots, including an incomplete one. The last one is
+ * loaded and the incomplete or older ones are removed. */
+TEST(load, manySnapshots, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        2, /* term */
+        9, /* index */
+        4  /* data */
+    };
+    char filename[64];
+    uint64_t now;
+
+    /* Take a snapshot but then remove the data file, as if the server crashed
+     * before it could complete writing it. */
+    uv_update_time(&f->loop);
+    now = uv_now(&f->loop);
+    sprintf(filename, "snapshot-1-8-%ju", now);
+    SNAPSHOT_PUT(1, 8, 1);
+    DirRemoveFile(f->dir, filename);
+
+    SNAPSHOT_PUT(1, 8, 2);
+    SNAPSHOT_PUT(2, 6, 3);
+    SNAPSHOT_PUT(2, 9, 4);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         10,        /* start index */
+         0,         /* data for first loaded entry */
+         0          /* n entries */
+    );
+
+    /* The orphaned .meta file is removed */
+    char meta_filename[128];
+    sprintf(meta_filename, "%s%s", filename, UV__SNAPSHOT_META_SUFFIX);
+    munit_assert_false(DirHasFile(f->dir, meta_filename));
+
+    return MUNIT_OK;
+}
+
+/* There are two snapshots, but the last one has an empty data file. The first
+ * one is loaded and the empty one is discarded. */
+TEST(load, emptySnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        4, /* index */
+        1  /* data */
+    };
+    char filename[64];
+    uint64_t now;
+
+    SNAPSHOT_PUT(1, 4, 1);
+
+    /* Take a snapshot but then truncate the data file, as if the server ran
+     * out of space before it could write it. */
+    uv_update_time(&f->loop);
+    now = uv_now(&f->loop);
+    sprintf(filename, "snapshot-2-6-%ju", now);
+    SNAPSHOT_PUT(2, 6, 2);
+    DirTruncateFile(f->dir, filename, 0);
+
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         5,         /* start index */
+         0,         /* data for first loaded entry */
+         0          /* n entries */
+    );
+
+    return MUNIT_OK;
+}
+
+/* There is an orphaned snapshot and an orphaned snapshot .meta file; make
+ * sure they are removed. */
+TEST(load, orphanedSnapshotFiles, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uv_update_time(&f->loop);
+    uint64_t now = uv_now(&f->loop);
+
+    struct snapshot expected_snapshot = {
+        2,  /* term */
+        16, /* index */
+        4   /* data */
+    };
+
+    char filename1_removed[64];
+    char metafilename1_removed[64];
+    char filename2_removed[64];
+    char metafilename2_removed[64];
+
+    /* Take a snapshot but then remove the data file, as if the server crashed
+     * before it could complete writing it. */
+    sprintf(filename1_removed, "snapshot-2-18-%ju", now);
+    sprintf(metafilename1_removed, "snapshot-2-18-%ju%s", now,
+            UV__SNAPSHOT_META_SUFFIX);
+    SNAPSHOT_PUT(2, 18, 1);
+    munit_assert_true(DirHasFile(f->dir, filename1_removed));
+    munit_assert_true(DirHasFile(f->dir, metafilename1_removed));
+    DirRemoveFile(f->dir, filename1_removed);
+
+    /* Take a snapshot but then remove the .meta file */
+    now = uv_now(&f->loop);
+    sprintf(filename2_removed, "snapshot-2-19-%ju", now);
+    sprintf(metafilename2_removed, "snapshot-2-19-%ju%s", now,
+            UV__SNAPSHOT_META_SUFFIX);
+    SNAPSHOT_PUT(2, 19, 2);
+    munit_assert_true(DirHasFile(f->dir, filename2_removed));
+    munit_assert_true(DirHasFile(f->dir, metafilename2_removed));
+    DirRemoveFile(f->dir, metafilename2_removed);
+
+    /* Take a valid snapshot and make sure it's loaded */
+    SNAPSHOT_PUT(2, 16, 4);
+    LOAD(0,                  /* term */
+         0,                  /* voted for */
+         &expected_snapshot, /* snapshot */
+         17,                 /* start index */
+         0,                  /* data for first loaded entry */
+         0                   /* n entries */
+    );
+
+    /* The orphaned files are removed */
+    munit_assert_false(DirHasFile(f->dir, metafilename1_removed));
+    munit_assert_false(DirHasFile(f->dir, filename2_removed));
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment with entries that are no longer
+ * needed, since they are included in a snapshot. We still keep those segments
+ * and just let the next snapshot logic delete them. */
+TEST(load, closedSegmentWithEntriesBehindSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        2, /* index */
+        1  /* data */
+    };
+    APPEND(1, 1);
+    SNAPSHOT_PUT(1, 2, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         3,         /* start index */
+         0,         /* data for first loaded entry */
+         0          /* n entries */
+    );
+    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment with entries that are no longer
+ * needed, since they are included in a snapshot. However it also has an open
+ * segment that has enough entries to reach the snapshot last index. */
+TEST(load, openSegmentWithEntriesPastSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        2, /* index */
+        1  /* data */
+    };
+    APPEND(1, 1);
+    APPEND(1, 2);
+    SNAPSHOT_PUT(1, 2, 1);
+    UNFINALIZE(2, 2, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         1,         /* start index */
+         1,         /* data for first loaded entry */
+         2          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment whose filename encodes a number of
+ * entries that differs from the number it actually contains. */
+TEST(load, closedSegmentWithInconsistentFilename, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    DirRenameFile(f->dir, "0000000000000001-0000000000000003",
+                  "0000000000000001-0000000000000004");
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load closed segment 0000000000000001-0000000000000004: found 3 "
+               "entries (expected 4)");
+    return MUNIT_OK;
+}
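+
+/* With raft_uv_set_auto_recovery() enabled (see LOAD_WITH_AUTO_RECOVERY),
+ * load() reacts to a corrupt or inconsistent segment by discarding it, and
+ * anything after it, instead of failing, trading the tail of the log for
+ * availability. The paired tests below exercise both behaviors. */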
+
+/* The data directory has a closed segment whose filename encodes a number of
+ * entries that differs from the number it actually contains, and
+ * auto-recovery is turned on. */
+TEST(load,
+     closedSegmentWithInconsistentFilenameAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    DirRenameFile(f->dir, "0000000000000001-0000000000000003",
+                  "0000000000000001-0000000000000004");
+    /* Load in pristine condition */
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            0,    /* data for first loaded entry */
+                            0     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment with entries that are no longer
+ * needed, since they are included in a snapshot. It also has an open segment,
+ * however that does not have enough entries to reach the snapshot last
+ * index. */
+TEST(load, openSegmentWithEntriesBehindSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    APPEND(1, 2);
+    SNAPSHOT_PUT(1, 3, 1);
+    UNFINALIZE(2, 2, 1);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "last entry on disk has index 2, which is behind last "
+               "snapshot's index 3");
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment with entries that are no longer
+ * needed, since they are included in a snapshot. It also has an open segment,
+ * however that does not have enough entries to reach the snapshot last
+ * index, and auto-recovery is turned on. */
+TEST(load,
+     openSegmentWithEntriesBehindSnapshotAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    APPEND(1, 1);
+    APPEND(1, 2);
+    SNAPSHOT_PUT(1, 3, 1);
+    UNFINALIZE(2, 2, 1);
+    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
+                            0,         /* voted for */
+                            &snapshot, /* snapshot */
+                            4,         /* start index */
+                            0,         /* data for first loaded entry */
+                            0          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment containing a
+ * valid entry, and no closed segments. */
+TEST(load, openSegmentNoClosedSegmentsSnapshotPresent, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    UNFINALIZE(4, 4, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         4,         /* start index */
+         4,         /* data for first loaded entry */
+         1          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and no closed segments. */
+TEST(load,
+     corruptOpenSegmentNoClosedSegmentsSnapshotPresent,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    UNFINALIZE(4, 4, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and no closed segments. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentNoClosedSegmentsSnapshotPresentWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    UNFINALIZE(4, 4, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    /* Load is successful. */
+    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
+                            0,         /* voted for */
+                            &snapshot, /* snapshot */
+                            4,         /* start index */
+                            1,         /* data for first loaded entry */
+                            1          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and a closed segment. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentSnapshotPresent,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(5, 5, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and a closed segment. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentSnapshotPresentWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(5, 5, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+
+    /* Load is successful. */
+    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
+                            0,         /* voted for */
+                            &snapshot, /* snapshot */
+                            4,         /* start index */
+                            4,         /* data for first loaded entry */
+                            1          /* n entries */
+    );
+
+    /* Open segment has been renamed */
+    munit_assert_false(DirHasFile(f->dir, "open-1"));
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and multiple closed segments. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentsSnapshotPresentWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    APPEND(1, 6);
+    UNFINALIZE(6, 6, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+
+    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
+                            0,         /* voted for */
+                            &snapshot, /* snapshot */
+                            4,         /* start index */
+                            4,         /* data for first loaded entry */
+                            2          /* n entries */
+    );
+    /* Open segment has been renamed during the first load */
+    munit_assert_false(DirHasFile(f->dir, "open-1"));
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and multiple closed segments. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentsSnapshotPresent,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    APPEND(1, 6);
+    UNFINALIZE(6, 6, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and an open segment with a
+ * corrupt format header and no snapshot. */
+TEST(load, corruptOpenSegmentClosedSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(4, 1);
+    APPEND(1, 5);
+    UNFINALIZE(5, 5, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and an open segment with a
+ * corrupt format header and no snapshot. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentsWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    APPEND(4, 1);
+    APPEND(1, 5);
+    UNFINALIZE(5, 5, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    /* Load is successful. */
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            1,    /* data for first loaded entry */
+                            4     /* n entries */
+    );
+    /* Open segment has been renamed */
+    munit_assert_false(DirHasFile(f->dir, "open-1"));
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and two open segments.
+ * The first open segment has a corrupt header. */
+TEST(load, corruptOpenSegmentsClosedSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(4, 4, 1);
+    UNFINALIZE(5, 5, 2);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and two open segments.
+ * The first open segment has a corrupt header. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentsClosedSegmentsWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(4, 4, 1);
+    UNFINALIZE(5, 5, 2);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            1,    /* data for first loaded entry */
+                            3     /* n entries */
+    );
+
+    /* Open segments have been renamed */
+    munit_assert_false(DirHasFile(f->dir, "open-1"));
+    munit_assert_false(DirHasFile(f->dir, "open-2"));
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and two open segments.
+ * The second open segment has a corrupt header. */
+TEST(load, corruptLastOpenSegmentClosedSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(4, 4, 1);
+    UNFINALIZE(5, 5, 2);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-2", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-2: unexpected format version 0");
+
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and two open segments.
+ * The second open segment has a corrupt header. Auto-recovery is turned on. */
+TEST(load,
+     corruptLastOpenSegmentClosedSegmentsWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(4, 4, 1);
+    UNFINALIZE(5, 5, 2);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-2", &version, sizeof version, 0);
+
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            1,    /* data for first loaded entry */
+                            4     /* n entries */
+    );
+    /* Open segment has been renamed during the first load */
+    munit_assert_false(DirHasFile(f->dir, "open-2"));
+    return MUNIT_OK;
+}
+
+/* The data directory has several closed segments, all with entries compatible
+ * with the snapshot. */
+TEST(load, closedSegmentsOverlappingWithSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        4, /* index */
+        1  /* data */
+    };
+    APPEND(1, 1);
+    APPEND(2, 2);
+    APPEND(3, 4);
+    SNAPSHOT_PUT(1, 4, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         1,         /* start index */
+         1,         /* data for first loaded entry */
+         6          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory has several closed segments, the last of which is
+ * corrupt. There is a snapshot. */
+TEST(load,
+     closedSegmentsWithSnapshotLastSegmentCorrupt,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    SNAPSHOT_PUT(1, 4, 1);
+    APPEND(1, 5);
+    APPEND(2, 6);
+    APPEND(2, 8);
+
+    /* Corrupt the last closed segment */
+    size_t offset =
+        WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */;
+    uint32_t corrupted = 123456789;
+    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9), &corrupted,
+                     sizeof corrupted, offset);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load closed segment 0000000000000008-0000000000000009: entries "
+               "batch 1 starting at byte 8: data checksum mismatch");
+    return MUNIT_OK;
+}
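+
+/* The corruption in these tests targets a fixed spot: a segment starts with
+ * an 8-byte format version, and each batch leads with two 4-byte checksums
+ * (header, then data), so overwriting 4 bytes at offset 12 clobbers the
+ * stored data checksum and reliably produces a "data checksum mismatch" for
+ * batch 1, which starts at byte 8. */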
*/ +TEST(load, + closedSegmentsWithSnapshotLastSegmentCorruptAutoRecovery, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + struct snapshot snapshot = { + 1, /* term */ + 4, /* index */ + 1 /* data */ + }; + SNAPSHOT_PUT(1, 4, 1); + APPEND(1, 5); + APPEND(2, 6); + APPEND(2, 8); + + /* Corrupt the last closed segment */ + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9), &corrupted, + sizeof corrupted, offset); + LOAD_WITH_AUTO_RECOVERY(0, /* term */ + 0, /* voted for */ + &snapshot, /* snapshot */ + 5, /* start index */ + 5, /* data for first loaded entry */ + 3 /* n entries */ + ); + return MUNIT_OK; +} + +/* The data directory has several closed segments, the last of which is corrupt. + * There is an open segment and a snapshot. Auto-recovery is turned on. */ +TEST(load, + closedSegmentsWithSnapshotLastSegmentCorruptOpenSegmentWithAutoRecovery, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + struct snapshot snapshot = { + 1, /* term */ + 4, /* index */ + 1 /* data */ + }; + SNAPSHOT_PUT(1, 4, 1); + APPEND(1, 5); + APPEND(2, 6); + APPEND(1, 8); + APPEND(1, 9); + UNFINALIZE(9, 9, 1); + + /* Corrupt the last closed segment */ + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 8), &corrupted, + sizeof corrupted, offset); + munit_assert_true(HAS_OPEN_SEGMENT_FILE(1)); + + LOAD_WITH_AUTO_RECOVERY(0, /* term */ + 0, /* voted for */ + &snapshot, /* snapshot */ + 5, /* start index */ + 5, /* data for first loaded entry */ + 3 /* n entries */ + ); + munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); + return MUNIT_OK; +} + +/* The data directory has several closed segments, the last of which is corrupt. + * There is an open segment and a snapshot. */ +TEST(load, + closedSegmentsWithSnapshotLastSegmentCorruptOpenSegment, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(1, 4, 1); + APPEND(1, 5); + APPEND(2, 6); + APPEND(1, 8); + APPEND(1, 9); + UNFINALIZE(9, 9, 1); + + /* Corrupt the last closed segment */ + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 8), &corrupted, + sizeof corrupted, offset); + munit_assert_true(HAS_OPEN_SEGMENT_FILE(1)); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000008-0000000000000008: entries " + "batch 1 starting at byte 8: data checksum mismatch"); + return MUNIT_OK; +} + +/* The data directory has several closed segments, the second to last one of + * which is corrupt. There is a snapshot. */ +TEST(load, + closedSegmentsWithSnapshotSecondLastSegmentCorrupt, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(1, 4, 1); + APPEND(1, 5); + APPEND(2, 6); + APPEND(2, 8); + + /* Corrupt the second last closed segment */ + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(6, 7), &corrupted, + sizeof corrupted, offset); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000006-0000000000000007: entries " + "batch 1 starting at byte 8: data checksum mismatch"); + + /* Second load still fails. 
*/ + LOAD_ERROR_NO_SETUP( + RAFT_CORRUPT, + "load closed segment 0000000000000006-0000000000000007: entries " + "batch 1 starting at byte 8: data checksum mismatch"); + + return MUNIT_OK; +} + +/* The data directory has several closed segments, some of which have a gap, + * which is still compatible with the snapshot. */ +TEST(load, nonContiguousClosedSegments, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct snapshot snapshot = { + 1, /* term */ + 4, /* index */ + 1 /* data */ + }; + APPEND(1, 1); + APPEND(2, 2); + APPEND(3, 4); + SNAPSHOT_PUT(1, 4, 1); + DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(2, 3)); + LOAD(0, /* term */ + 0, /* voted for */ + &snapshot, /* snapshot */ + 4, /* start index */ + 4, /* data for first loaded entry */ + 3 /* n entries */ + ); + return MUNIT_OK; +} + +/* If the data directory has a closed segment whose start index is beyond the + * snapshot's last index, an error is returned. */ +TEST(load, closedSegmentWithEntriesPastSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint64_t now; + char errmsg[128]; + APPEND(5, 1); + APPEND(1, 5); + uv_update_time(&f->loop); + now = uv_now(&f->loop); + sprintf(errmsg, + "closed segment 0000000000000006-0000000000000006 is past last " + "snapshot snapshot-1-4-%ju", + now); + SNAPSHOT_PUT(1, 4, 1); + DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 5)); + LOAD_ERROR(RAFT_CORRUPT, errmsg); + return MUNIT_OK; +} + +/* The data directory has an open segment which has incomplete format data. */ +TEST(load, openSegmentWithIncompleteFormat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + DirWriteFileWithZeros(f->dir, "open-1", WORD_SIZE / 2); + LOAD_ERROR(RAFT_IOERR, "load open segment open-1: file has only 4 bytes"); + return MUNIT_OK; +} + +/* The data directory has an open segment which has an incomplete batch + * preamble. */ +TEST(load, openSegmentWithIncompletePreamble, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE /* Checksums */; + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirTruncateFile(f->dir, "open-1", offset); + LOAD_ERROR(RAFT_IOERR, + "load open segment open-1: entries batch 1 starting at byte 16: " + "read preamble: short read: 0 bytes instead of 8"); + return MUNIT_OK; +} + +/* The data directory has an open segment which has an incomplete batch header. */ +TEST(load, openSegmentWithIncompleteBatchHeader, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = WORD_SIZE + /* Format version */ + WORD_SIZE + /* Checksums */ + WORD_SIZE + /* Number of entries */ + WORD_SIZE /* Partial batch header */; + + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirTruncateFile(f->dir, "open-1", offset); + LOAD_ERROR(RAFT_IOERR, + "load open segment open-1: entries batch 1 starting at byte 8: " + "read header: short read: 8 bytes instead of 16"); + return MUNIT_OK; +} + +/* The data directory has an open segment which has incomplete batch data.
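+ * The truncation offset below follows the segment layout assumed throughout + * these tests: one word each for the format version, the checksums, the + * entry count, the entry term and the entry type/size, then half a word of + * (truncated) entry data.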
*/ +TEST(load, openSegmentWithIncompleteBatchData, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = WORD_SIZE + /* Format version */ + WORD_SIZE + /* Checksums */ + WORD_SIZE + /* Number of entries */ + WORD_SIZE + /* Entry term */ + WORD_SIZE + /* Entry type and data size */ + WORD_SIZE / 2 /* Partial entry data */; + + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirTruncateFile(f->dir, "open-1", offset); + LOAD_ERROR(RAFT_IOERR, + "load open segment open-1: entries batch 1 starting at byte 8: " + "read data: short read: 4 bytes instead of 8"); + return MUNIT_OK; +} + +/* The data directory has a closed segment which has corrupted batch header. */ +TEST(load, closedSegmentWithCorruptedBatchHeader, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = WORD_SIZE /* Format version */; + uint64_t corrupted = 12345678; + APPEND(1, 1); + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), &corrupted, + sizeof corrupted, offset); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000001-0000000000000001: entries " + "batch 1 starting at byte 8: header checksum mismatch"); + return MUNIT_OK; +} + +/* The data directory has a closed segment which has corrupted batch data. */ +TEST(load, closedSegmentWithCorruptedBatchData, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + APPEND(1, 1); + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), &corrupted, + sizeof corrupted, offset); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000001-0000000000000001: entries " + "batch 1 starting at byte 8: data checksum mismatch"); + return MUNIT_OK; +} + +/* The data directory has a closed segment whose first index does not match what + * we expect. */ +TEST(load, closedSegmentWithBadIndex, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 1); + APPEND(1, 2); + DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1)); + LOAD_ERROR(RAFT_CORRUPT, + "unexpected closed segment 0000000000000002-0000000000000002: " + "first index should have been 1"); + return MUNIT_OK; +} + +/* The data directory has an empty closed segment. */ +TEST(load, emptyClosedSegment, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + DirWriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), NULL, 0); + LOAD_ERROR( + RAFT_CORRUPT, + "load closed segment 0000000000000001-0000000000000001: file is empty"); + return MUNIT_OK; +} + +/* The data directory has a closed segment with an unexpected format. */ +TEST(load, closedSegmentWithBadFormat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t buf[8] = {2, 0, 0, 0, 0, 0, 0, 0}; + DirWriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), buf, sizeof buf); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000001-0000000000000001: " + "unexpected format version 2"); + return MUNIT_OK; +} + +/* The data directory has an open segment which is not readable. */ +TEST(load, openSegmentWithNoAccessPermission, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* Skip the test when running as root, since EACCES would not be triggered + * in that case. */ + if (getuid() == 0) { + SETUP_UV; /* Setup the uv object since teardown expects it. 
*/ + return MUNIT_SKIP; + } + + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirMakeFileUnreadable(f->dir, "open-1"); + LOAD_ERROR(RAFT_IOERR, + "load open segment open-1: read file: open: permission denied"); + return MUNIT_OK; +} + +/* The data directory has an open segment with format set to 0 and non-zero + * content. */ +TEST(load, openSegmentWithZeroFormatAndThenData, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint64_t version = 0 /* Format version */; + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); + LOAD_ERROR(RAFT_CORRUPT, + "load open segment open-1: unexpected format version 0"); + return MUNIT_OK; +} + +/* The data directory has an open segment with an unexpected format. */ +TEST(load, openSegmentWithBadFormat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t version[8] = {2, 0, 0, 0, 0, 0, 0, 0}; + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirOverwriteFile(f->dir, "open-1", version, sizeof version, 0); + LOAD_ERROR(RAFT_CORRUPT, + "load open segment open-1: unexpected format version 2"); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_recover.c b/test/raft/integration/test_uv_recover.c new file mode 100644 index 000000000..f1435a656 --- /dev/null +++ b/test/raft/integration/test_uv_recover.c @@ -0,0 +1,80 @@ +#include "../lib/runner.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; +}; + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_UV; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV; + TEAR_DOWN_UV_DEPS; + free(f); +} + +/****************************************************************************** + * + * raft_io->recover() + * + *****************************************************************************/ + +SUITE(recover) + +/* Invoke recover and assert that it fails with the given error. */ +#define RECOVER_ERROR(RV, CONF) \ + { \ + int rv_; \ + rv_ = f->io.recover(&f->io, CONF); \ + munit_assert_int(rv_, ==, RV); \ + } + +/* Invoke recover and assert that it succeeds */ +#define RECOVER(CONF) RECOVER_ERROR(0, CONF) + +/* If the instance has been already initialized, an error is returned. */ +/* A new configuration is saved as last entry on disk. 
*/ +TEST(recover, newConfiguration, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_configuration configuration1; + struct raft_configuration configuration2; + int rv; + + /* Bootstrap using an initial configuration */ + raft_configuration_init(&configuration1); + rv = raft_configuration_add(&configuration1, 1, "1", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + rv = raft_configuration_add(&configuration1, 2, "2", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + rv = f->io.bootstrap(&f->io, &configuration1); + munit_assert_int(rv, ==, 0); + + /* Bootstrap using a different configuration */ + raft_configuration_init(&configuration2); + rv = raft_configuration_add(&configuration2, 1, "1", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + + RECOVER(&configuration2); + + raft_configuration_close(&configuration1); + raft_configuration_close(&configuration2); + + return 0; +} diff --git a/test/raft/integration/test_uv_recv.c b/test/raft/integration/test_uv_recv.c new file mode 100644 index 000000000..9c49394d8 --- /dev/null +++ b/test/raft/integration/test_uv_recv.c @@ -0,0 +1,480 @@ +#include "../lib/runner.h" +#include "../lib/tcp.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance. + * + *****************************************************************************/ + +struct peer +{ + struct uv_loop_s loop; + struct raft_uv_transport transport; + struct raft_io io; +}; + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_TCP; + FIXTURE_UV; + struct peer peer; + bool closed; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + struct raft_message *message; + bool done; +}; + +static void recvCb(struct raft_io *io, struct raft_message *m1) +{ + struct result *result = io->data; + struct raft_message *m2 = result->message; + unsigned i; + munit_assert_int(m1->type, ==, m2->type); + switch (m1->type) { + case RAFT_IO_REQUEST_VOTE: + munit_assert_int(m1->request_vote.term, ==, m2->request_vote.term); + munit_assert_int(m1->request_vote.candidate_id, ==, + m2->request_vote.candidate_id); + munit_assert_int(m1->request_vote.last_log_index, ==, + m2->request_vote.last_log_index); + munit_assert_int(m1->request_vote.last_log_term, ==, + m2->request_vote.last_log_term); + munit_assert_int(m1->request_vote.disrupt_leader, ==, + m2->request_vote.disrupt_leader); + break; + case RAFT_IO_REQUEST_VOTE_RESULT: + munit_assert_int(m1->request_vote_result.term, ==, + m2->request_vote_result.term); + munit_assert_int(m1->request_vote_result.vote_granted, ==, + m2->request_vote_result.vote_granted); + break; + case RAFT_IO_APPEND_ENTRIES: + munit_assert_int(m1->append_entries.n_entries, ==, + m2->append_entries.n_entries); + for (i = 0; i < m1->append_entries.n_entries; i++) { + struct raft_entry *entry1 = &m1->append_entries.entries[i]; + struct raft_entry *entry2 = &m2->append_entries.entries[i]; + munit_assert_int(entry1->term, ==, entry2->term); + munit_assert_int(entry1->type, ==, entry2->type); + munit_assert_int(entry1->buf.len, ==, entry2->buf.len); + munit_assert_int( + memcmp(entry1->buf.base, entry2->buf.base, entry1->buf.len), + ==, 0); + } + if (m1->append_entries.n_entries > 0) { + raft_free(m1->append_entries.entries[0].batch); + raft_free(m1->append_entries.entries); + } + break; + case RAFT_IO_APPEND_ENTRIES_RESULT: 
+ munit_assert_int(m1->append_entries_result.term, ==, + m2->append_entries_result.term); + munit_assert_int(m1->append_entries_result.rejected, ==, + m2->append_entries_result.rejected); + munit_assert_int(m1->append_entries_result.last_log_index, ==, + m2->append_entries_result.last_log_index); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + munit_assert_int(m1->install_snapshot.conf.n, ==, + m2->install_snapshot.conf.n); + for (i = 0; i < m1->install_snapshot.conf.n; i++) { + struct raft_server *s1 = &m1->install_snapshot.conf.servers[i]; + struct raft_server *s2 = &m2->install_snapshot.conf.servers[i]; + munit_assert_int(s1->id, ==, s2->id); + munit_assert_string_equal(s1->address, s2->address); + munit_assert_int(s1->role, ==, s2->role); + } + munit_assert_int(m1->install_snapshot.data.len, ==, + m2->install_snapshot.data.len); + munit_assert_int(memcmp(m1->install_snapshot.data.base, + m2->install_snapshot.data.base, + m2->install_snapshot.data.len), + ==, 0); + raft_configuration_close(&m1->install_snapshot.conf); + raft_free(m1->install_snapshot.data.base); + break; + case RAFT_IO_TIMEOUT_NOW: + munit_assert_int(m1->timeout_now.term, ==, m2->timeout_now.term); + munit_assert_int(m1->timeout_now.last_log_index, ==, + m2->timeout_now.last_log_index); + munit_assert_int(m1->timeout_now.last_log_term, ==, + m2->timeout_now.last_log_term); + break; + }; + result->done = true; +} + +static void peerSendCb(struct raft_io_send *req, int status) +{ + bool *done = req->data; + munit_assert_int(status, ==, 0); + *done = true; +} + +static void peerCloseCb(struct raft_io *io) +{ + bool *done = io->data; + *done = true; +} + +/* Set up the fixture's peer raft_io instance. */ +#define PEER_SETUP \ + do { \ + struct uv_loop_s *_loop = &f->peer.loop; \ + struct raft_uv_transport *_transport = &f->peer.transport; \ + struct raft_io *_io = &f->peer.io; \ + int _rv; \ + _rv = uv_loop_init(_loop); \ + munit_assert_int(_rv, ==, 0); \ + _transport->version = 1; \ + _rv = raft_uv_tcp_init(_transport, _loop); \ + munit_assert_int(_rv, ==, 0); \ + _rv = raft_uv_init(_io, _loop, f->dir, _transport); \ + munit_assert_int(_rv, ==, 0); \ + _rv = _io->init(_io, 2, "127.0.0.1:9002"); \ + munit_assert_int(_rv, ==, 0); \ + } while (0) + +/* Tear down the fixture's peer raft_io instance. */ +#define PEER_TEAR_DOWN \ + do { \ + struct uv_loop_s *_loop = &f->peer.loop; \ + struct raft_uv_transport *_transport = &f->peer.transport; \ + struct raft_io *_io = &f->peer.io; \ + bool _done = false; \ + int _i; \ + _done = false; \ + _io->data = &_done; \ + _io->close(_io, peerCloseCb); \ + for (_i = 0; _i < 10; _i++) { \ + if (_done) { \ + break; \ + } \ + uv_run(_loop, UV_RUN_ONCE); \ + } \ + uv_run(_loop, UV_RUN_DEFAULT); \ + munit_assert_true(_done); \ + raft_uv_close(_io); \ + raft_uv_tcp_close(_transport); \ + uv_loop_close(_loop); \ + } while (0) + +/* Send a message to the main fixture's raft_io instance using the fixture's + * peer instance. 
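+ * The macro drives the peer's loop for at most 10 iterations and fails the + * test if the send callback has not fired by then, so a hung write surfaces + * as an assertion failure rather than a stuck test run.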
*/ +#define PEER_SEND(MESSAGE) \ + do { \ + struct uv_loop_s *_loop = &f->peer.loop; \ + struct raft_io *_io = &f->peer.io; \ + struct raft_io_send _req; \ + bool _done = false; \ + int _i; \ + int _rv; \ + (MESSAGE)->server_id = 1; \ + (MESSAGE)->server_address = "127.0.0.1:9001"; \ + _req.data = &_done; \ + _rv = _io->send(_io, &_req, MESSAGE, peerSendCb); \ + munit_assert_int(_rv, ==, 0); \ + for (_i = 0; _i < 10; _i++) { \ + if (_done) { \ + break; \ + } \ + uv_run(_loop, UV_RUN_ONCE); \ + } \ + munit_assert_true(_done); \ + } while (0) + +/* Establish a connection and send a handshake using plain TCP. */ +#define PEER_HANDSHAKE \ + do { \ + uint8_t _handshake[] = { \ + 6, 6, 6, 0, 0, 0, 0, 0, /* Protocol */ \ + 1, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ \ + 2, 0, 0, 0, 0, 0, 0, 0, /* Address length, in words */ \ + 0, 0, 0, 0, 0, 0, 0, 0, /* First address word */ \ + 0, 0, 0, 0, 0, 0, 0, 0 /* Second address word */ \ + }; \ + sprintf((char *)&_handshake[24], "127.0.0.1:666"); \ + TCP_CLIENT_CONNECT(9001); \ + TCP_CLIENT_SEND(_handshake, sizeof _handshake); \ + } while (0); + +/* Run the loop until a new message is received. Assert that the received + * message matches the given one. */ +#define RECV(MESSAGE) \ + do { \ + struct result _result = {MESSAGE, false}; \ + f->io.data = &_result; \ + LOOP_RUN_UNTIL(&_result.done); \ + f->io.data = NULL; \ + } while (0) + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_TCP; + PEER_SETUP; + f->io.data = f; + f->closed = false; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + PEER_TEAR_DOWN; + TEAR_DOWN_TCP; + TEAR_DOWN_UV_DEPS; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + int rv; + SETUP_UV; + f->io.data = f; + rv = f->io.start(&f->io, 10000, NULL, recvCb); + munit_assert_int(rv, ==, 0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV; + tearDownDeps(f); +} + +/****************************************************************************** + * + * raft_io_recv_cb + * + *****************************************************************************/ + +SUITE(recv) + +/* Receive the very first message over the connection. */ +TEST(recv, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_REQUEST_VOTE; + message.request_vote.candidate_id = 2; + message.request_vote.last_log_index = 123; + message.request_vote.last_log_term = 2; + message.request_vote.disrupt_leader = false; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive a first message, then another one. */ +TEST(recv, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_REQUEST_VOTE; + message.request_vote.candidate_id = 2; + message.request_vote.last_log_index = 123; + message.request_vote.last_log_term = 2; + message.request_vote.disrupt_leader = true; + PEER_SEND(&message); + RECV(&message); + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive a RequestVote result message.
*/ +TEST(recv, requestVoteResult, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_REQUEST_VOTE_RESULT; + message.request_vote_result.term = 3; + message.request_vote_result.vote_granted = true; + message.request_vote_result.pre_vote = false; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive an AppendEntries message with two entries. */ +TEST(recv, appendEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry entries[2]; + struct raft_message message; + uint8_t data1[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + uint8_t data2[8] = {8, 7, 6, 5, 4, 3, 2, 1}; + + entries[0].type = RAFT_COMMAND; + entries[0].buf.base = data1; + entries[0].buf.len = sizeof data1; + + entries[1].type = RAFT_COMMAND; + entries[1].buf.base = data2; + entries[1].buf.len = sizeof data2; + + message.type = RAFT_IO_APPEND_ENTRIES; + message.append_entries.entries = entries; + message.append_entries.n_entries = 2; + + PEER_SEND(&message); + RECV(&message); + + return MUNIT_OK; +} + +/* Receive an AppendEntries message with no entries (i.e. a heartbeat). */ +TEST(recv, heartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_APPEND_ENTRIES; + message.append_entries.entries = NULL; + message.append_entries.n_entries = 0; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive an AppendEntries result message. */ +TEST(recv, appendEntriesResult, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_APPEND_ENTRIES_RESULT; + message.append_entries_result.term = 3; + message.append_entries_result.rejected = 0; + message.append_entries_result.last_log_index = 123; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive an InstallSnapshot message. */ +TEST(recv, installSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + uint8_t snapshot_data[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + int rv; + + message.type = RAFT_IO_INSTALL_SNAPSHOT; + message.install_snapshot.term = 2; + message.install_snapshot.last_index = 123; + message.install_snapshot.last_term = 1; + raft_configuration_init(&message.install_snapshot.conf); + rv = raft_configuration_add(&message.install_snapshot.conf, 1, "1", + RAFT_VOTER); + munit_assert_int(rv, ==, 0); + message.install_snapshot.data.len = sizeof snapshot_data; + message.install_snapshot.data.base = snapshot_data; + + PEER_SEND(&message); + RECV(&message); + + raft_configuration_close(&message.install_snapshot.conf); + + return MUNIT_OK; +} + +/* Receive a TimeoutNow message. */ +TEST(recv, timeoutNow, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_TIMEOUT_NOW; + message.timeout_now.term = 3; + message.timeout_now.last_log_index = 123; + message.timeout_now.last_log_term = 2; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* The handshake fails because of an unexpected protocol version. */ +TEST(recv, badProtocol, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t handshake[] = { + 6, 6, 6, 0, 0, 0, 0, 0, /* Protocol */ + 1, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 2, 0, 0, 0, 0, 0, 0, 0 /* Address length */ + }; + TCP_CLIENT_CONNECT(9001); + TCP_CLIENT_SEND(handshake, sizeof handshake); + LOOP_RUN(2); + return MUNIT_OK; +} + +/* A message can't have zero length.
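+ * The second word of the header carries the message size; a zero there is + * expected to make the receiving end drop the connection rather than attempt + * a zero-byte read (the test only checks that the loop survives it).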
*/ +TEST(recv, badSize, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t header[] = { + 1, 0, 0, 0, 0, 0, 0, 0, /* Message type */ + 0, 0, 0, 0, 0, 0, 0, 0 /* Message size */ + }; + PEER_HANDSHAKE; + TCP_CLIENT_SEND(header, sizeof header); + LOOP_RUN(2); + return MUNIT_OK; +} + +/* A message with a bad type causes the connection to be aborted. */ +TEST(recv, badType, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t header[] = { + 1, 2, 3, 4, 5, 6, 7, 8, /* Message type */ + 0, 0, 0, 0, 0, 0, 0, 0 /* Message size */ + }; + PEER_HANDSHAKE; + TCP_CLIENT_SEND(header, sizeof header); + LOOP_RUN(2); + return MUNIT_OK; +} + +/* The backend is closed just before accepting a new connection. */ +TEST(recv, closeBeforeAccept, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + uint8_t header[] = { + 1, 2, 3, 4, 5, 6, 7, 8, /* Message type */ + 0, 0, 0, 0, 0, 0, 0, 0 /* Message size */ + }; + PEER_HANDSHAKE; + TCP_CLIENT_SEND(header, sizeof header); + LOOP_RUN(1); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The backend is closed after receiving the header of an AppendEntries + * message. */ +TEST(recv, closeAfterAppendEntriesHeader, setUp, tearDown, 0, NULL) +{ + /* TODO */ + return MUNIT_SKIP; +} diff --git a/test/raft/integration/test_uv_send.c b/test/raft/integration/test_uv_send.c new file mode 100644 index 000000000..056944a4d --- /dev/null +++ b/test/raft/integration/test_uv_send.c @@ -0,0 +1,413 @@ +#include + +#include "../lib/runner.h" +#include "../lib/tcp.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance and some pre-set messages. + * + *****************************************************************************/ + +#define N_MESSAGES 5 + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_TCP_SERVER; + FIXTURE_UV; + struct raft_message messages[N_MESSAGES]; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; +}; + +static void sendCbAssertResult(struct raft_io_send *req, int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +/* Get I'th fixture's message. */ +#define MESSAGE(I) (&f->messages[I]) + +/* Submit a send request for the I'th fixture's message. */ +#define SEND_SUBMIT(I, RV, STATUS) \ + struct raft_io_send _req##I; \ + struct result _result##I = {STATUS, false}; \ + int _rv##I; \ + _req##I.data = &_result##I; \ + _rv##I = \ + f->io.send(&f->io, &_req##I, &f->messages[I], sendCbAssertResult); \ + munit_assert_int(_rv##I, ==, RV) + +/* Wait for the submit request of the I'th message to finish. */ +#define SEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done) + +/* Submit a send request for the I'th fixture's message and wait for the + * operation to successfully complete. */ +#define SEND(I) \ + do { \ + SEND_SUBMIT(I, 0 /* rv */, 0 /* status */); \ + SEND_WAIT(I); \ + } while (0) + +/* Submit a send request and assert that it fails synchronously with the + * given error code and message. */ +#define SEND_ERROR(I, RV, ERRMSG) \ + do { \ + SEND_SUBMIT(I, RV, 0 /* status */); \ + /* munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ + } while (0) + +/* Submit a send request and wait for the operation to fail with the given code + * and message. 
*/ +#define SEND_FAILURE(I, STATUS, ERRMSG) \ + do { \ + SEND_SUBMIT(I, 0 /* rv */, STATUS); \ + SEND_WAIT(I); \ + /*munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ + } while (0) + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_TCP_SERVER; + f->io.data = f; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_TCP_SERVER; + TEAR_DOWN_UV_DEPS; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + unsigned i; + SETUP_UV; + raft_uv_set_connect_retry_delay(&f->io, 1); + for (i = 0; i < N_MESSAGES; i++) { + struct raft_message *message = &f->messages[i]; + message->type = RAFT_IO_REQUEST_VOTE; + message->server_id = 1; + message->server_address = f->server.address; + } + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV; + tearDownDeps(f); +} + +/****************************************************************************** + * + * raft_io->send() + * + *****************************************************************************/ + +SUITE(send) + +/* The first time a request is sent to a server a connection attempt is + * triggered. If the connection succeeds the request gets written out. */ +TEST(send, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SEND(0); + return MUNIT_OK; +} + +/* The second time a request is sent it re-uses the connection that was already + * established */ +TEST(send, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SEND(0); + SEND(0); + return MUNIT_OK; +} + +/* Submit a few send requests in parallel. */ +TEST(send, parallel, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SEND_SUBMIT(0 /* message */, 0 /* rv */, 0 /* status */); + SEND_SUBMIT(1 /* message */, 0 /* rv */, 0 /* status */); + SEND_WAIT(0); + SEND_WAIT(1); + return MUNIT_OK; +} + +/* Send a request vote result message. */ +TEST(send, voteResult, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->type = RAFT_IO_REQUEST_VOTE_RESULT; + SEND(0); + return MUNIT_OK; +} + +/* Send an append entries message. */ +TEST(send, appendEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry entries[2]; + entries[0].buf.base = raft_malloc(16); + entries[0].buf.len = 16; + entries[1].buf.base = raft_malloc(8); + entries[1].buf.len = 8; + + MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; + MESSAGE(0)->append_entries.entries = entries; + MESSAGE(0)->append_entries.n_entries = 2; + + SEND(0); + + raft_free(entries[0].buf.base); + raft_free(entries[1].buf.base); + + return MUNIT_OK; +} + +/* Send an append entries message with zero entries (i.e. a heartbeat). */ +TEST(send, heartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; + MESSAGE(0)->append_entries.entries = NULL; + MESSAGE(0)->append_entries.n_entries = 0; + SEND(0); + return MUNIT_OK; +} + +/* Send an append entries result message. 
*/ +TEST(send, appendEntriesResult, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES_RESULT; + SEND(0); + return MUNIT_OK; +} + +/* Send an install snapshot message. */ +TEST(send, installSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_install_snapshot *p = &MESSAGE(0)->install_snapshot; + int rv; + + MESSAGE(0)->type = RAFT_IO_INSTALL_SNAPSHOT; + + raft_configuration_init(&p->conf); + rv = raft_configuration_add(&p->conf, 1, "1", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + + p->data.len = 8; + p->data.base = raft_malloc(p->data.len); + + SEND(0); + + raft_configuration_close(&p->conf); + raft_free(p->data.base); + + return MUNIT_OK; +} + +/* A connection attempt fails asynchronously after the connect function + * returns. */ +TEST(send, noConnection, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->server_address = "127.0.0.1:123456"; + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The message has an invalid IPv4 address. */ +TEST(send, badAddress, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->server_address = "1"; + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* Make sure UvSend doesn't use a stale connection for a certain server id + * by first sending a message to a valid address and then sending a message to + * an invalid address, making sure the valid connection is not reused. + * Afterwards assert that a send to the correct address still succeeds. */ +TEST(send, changeToUnconnectedAddress, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + + /* Send a message to a server and a connected address */ + SEND(0); + + /* Send a message to the same server, but update the address to an + * unconnected address and assert it fails. */ + munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(1)->server_id); + MESSAGE(1)->server_address = "127.0.0.2:1"; + SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + + /* Send another message to the same server and connected address */ + munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(2)->server_id); + SEND(2); + + /* Send another message to the same server and connected address */ + munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(3)->server_id); + SEND(3); + + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The message has an invalid type. */ +TEST(send, badMessage, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->type = 666; + SEND_ERROR(0, RAFT_MALFORMED, ""); + return MUNIT_OK; +} + +/* Old send requests that have accumulated and could not yet be sent are + * progressively evicted. */ +TEST(send, evictOldPending, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + TCP_SERVER_STOP; + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_NOCONNECTION /* status */); + SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + SEND_SUBMIT(2 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + SEND_SUBMIT(3 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + SEND_WAIT(0); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* After the connection is established the peer dies and then comes back a + * little bit later. 
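+ * The write that was in flight when the peer died is expected to fail with + * RAFT_IOERR, and the next send should trigger a fresh connection attempt.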
*/ +TEST(send, reconnectAfterWriteError, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int socket; + SEND(0); + socket = TcpServerAccept(&f->server); + close(socket); + SEND_FAILURE(0, RAFT_IOERR, ""); + SEND(0); + return MUNIT_OK; +} + +/* After the connection is established the peer dies and then comes back a + * little bit later. At the time the peer died there were several writes + * pending. */ +TEST(send, reconnectAfterMultipleWriteErrors, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int socket; + signal(SIGPIPE, SIG_IGN); + SEND(0); + socket = TcpServerAccept(&f->server); + close(socket); + SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_IOERR /* status */); + SEND_SUBMIT(2 /* message */, 0 /* rv */, RAFT_IOERR /* status */); + SEND_WAIT(1); + SEND_WAIT(2); + SEND(3); + return MUNIT_OK; +} + +static char *oomHeapFaultDelay[] = {"0", "1", "2", "3", "4", NULL}; +static char *oomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum oomParams[] = { + {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory conditions. */ +TEST(send, oom, setUp, tearDown, 0, oomParams) +{ + struct fixture *f = data; + HEAP_FAULT_ENABLE; + SEND_ERROR(0, RAFT_NOMEM, ""); + return MUNIT_OK; +} + +static char *oomAsyncHeapFaultDelay[] = {"2", NULL}; +static char *oomAsyncHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum oomAsyncParams[] = { + {TEST_HEAP_FAULT_DELAY, oomAsyncHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, oomAsyncHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Transient out of memory error happening after send() has returned. */ +TEST(send, oomAsync, setUp, tearDown, 0, oomAsyncParams) +{ + struct fixture *f = data; + SEND(0); + return MUNIT_OK; +} + +/* The backend gets closed while there is a pending write. */ +TEST(send, closeDuringWrite, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry entry; + + /* Set a very large message that is likely to fill the socket buffer. + * TODO: figure out a more deterministic way to choose the value. */ + entry.buf.len = 1024 * 1024 * 8; + entry.buf.base = raft_malloc(entry.buf.len); + + MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; + MESSAGE(0)->append_entries.entries = &entry; + MESSAGE(0)->append_entries.n_entries = 1; + + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + TEAR_DOWN_UV; + + raft_free(entry.buf.base); + + return MUNIT_OK; +} + +/* The backend gets closed while there is a pending connect request. */ +TEST(send, closeDuringConnection, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + TEAR_DOWN_UV; + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_set_term.c b/test/raft/integration/test_uv_set_term.c new file mode 100644 index 000000000..7329b4568 --- /dev/null +++ b/test/raft/integration/test_uv_set_term.c @@ -0,0 +1,242 @@ +#include "../../../src/raft.h" +#include "../../../src/raft/byte.h" +#include "../lib/runner.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance.
+ * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; + bool closed; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +static void closeCb(struct raft_io *io) +{ + struct fixture *f = io->data; + f->closed = true; +} + +/* Invoke raft_uv_init() and assert that no error occurs. */ +#define INIT \ + do { \ + int _rv; \ + _rv = raft_uv_init(&f->io, &f->loop, f->dir, &f->transport); \ + munit_assert_int(_rv, ==, 0); \ + _rv = f->io.init(&f->io, 1, "1"); \ + munit_assert_int(_rv, ==, 0); \ + } while (0) + +/* Invoke raft_io->close(). */ +#define CLOSE \ + do { \ + f->io.close(&f->io, closeCb); \ + LOOP_RUN_UNTIL(&f->closed); \ + raft_uv_close(&f->io); \ + } while (0) + +/* Invoke f->io->set_term() and assert that no error occurs. */ +#define SET_TERM(TERM) \ + do { \ + int _rv; \ + _rv = f->io.set_term(&f->io, TERM); \ + munit_assert_int(_rv, ==, 0); \ + } while (0) + +/* Invoke f->io->set_term() and assert that the given error code is returned and + * the given error message set. */ +#define SET_TERM_ERROR(TERM, RV, ERRMSG) \ + do { \ + int _rv; \ + _rv = f->io.set_term(&f->io, TERM); \ + munit_assert_int(_rv, ==, RV); \ + munit_assert_string_equal(f->io.errmsg_(&f->io), ERRMSG); \ + } while (0) + +/* Write either the metadata1 or metadata2 file, filling it with the given + * values. */ +#define WRITE_METADATA_FILE(N, FORMAT, VERSION, TERM, VOTED_FOR) \ + { \ + uint8_t buf[8 * 4]; \ + void *cursor = buf; \ + char filename[strlen("metadataN") + 1]; \ + sprintf(filename, "metadata%d", N); \ + bytePut64(&cursor, FORMAT); \ + bytePut64(&cursor, VERSION); \ + bytePut64(&cursor, TERM); \ + bytePut64(&cursor, VOTED_FOR); \ + DirWriteFile(f->dir, filename, buf, sizeof buf); \ + } + +/* Assert that the content of either the metadata1 or metadata2 file match the + * given values. */ +#define ASSERT_METADATA_FILE(N, VERSION, TERM, VOTED_FOR) \ + { \ + uint8_t buf2[8 * 4]; \ + const void *cursor = buf2; \ + char filename[strlen("metadataN") + 1]; \ + sprintf(filename, "metadata%d", N); \ + DirReadFile(f->dir, filename, buf2, sizeof buf2); \ + munit_assert_int(byteGet64(&cursor), ==, 1); \ + munit_assert_int(byteGet64(&cursor), ==, VERSION); \ + munit_assert_int(byteGet64(&cursor), ==, TERM); \ + munit_assert_int(byteGet64(&cursor), ==, VOTED_FOR); \ + } + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + f->io.data = f; + f->closed = false; + return f; +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + INIT; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + CLOSE; + TEAR_DOWN_UV_DEPS; + free(f); +} + +/****************************************************************************** + * + * raft_io->set_term() + * + *****************************************************************************/ + +SUITE(set_term) + +/* The very first time set_term() is called, the metadata1 file gets written. 
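+ * Subsequent calls alternate between metadata1 and metadata2 while the + * version counter keeps growing, so the file with the higher version always + * holds the current term. Roughly: + * + * SET_TERM(1) -> metadata1 {version 1, term 1} + * SET_TERM(2) -> metadata2 {version 2, term 2} + * SET_TERM(3) -> metadata1 {version 3, term 3} + * SET_TERM(4) -> metadata2 {version 4, term 4}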
*/ +TEST(set_term, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SET_TERM(1); + ASSERT_METADATA_FILE(1, 1, 1, 0); + munit_assert_false(DirHasFile(f->dir, "metadata2")); + return MUNIT_OK; +} + +/* The second time set_term() is called, the metadata2 file gets written. */ +TEST(set_term, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SET_TERM(1); + SET_TERM(2); + ASSERT_METADATA_FILE(1, 1, 1, 0); + ASSERT_METADATA_FILE(2, 2, 2, 0); + return MUNIT_OK; +} + +/* The third time set_term() is called, the metadata1 file gets overwritten. */ +TEST(set_term, third, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SET_TERM(1); + SET_TERM(2); + SET_TERM(3); + ASSERT_METADATA_FILE(1, 3, 3, 0); + ASSERT_METADATA_FILE(2, 2, 2, 0); + return MUNIT_OK; +} + +/* The fourth time set_term() is called, the metadata2 file gets overwritten. */ +TEST(set_term, fourth, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SET_TERM(1); + SET_TERM(2); + SET_TERM(3); + SET_TERM(4); + ASSERT_METADATA_FILE(1, 3, 3, 0); + ASSERT_METADATA_FILE(2, 4, 4, 0); + return MUNIT_OK; +} + +/* If the data directory has a single metadata1 file, the first time set_term() + * is called, the second metadata file gets created. */ +TEST(set_term, metadataOneExists, setUpDeps, tearDown, 0, NULL) +{ + struct fixture *f = data; + WRITE_METADATA_FILE(1, /* Metadata file index */ + 1, /* Format */ + 1, /* Version */ + 1, /* Term */ + 0 /* Voted for */); + INIT; + SET_TERM(2); + ASSERT_METADATA_FILE(1, 1, 1, 0); + ASSERT_METADATA_FILE(2, 2, 2, 0); + return MUNIT_OK; +} + +/* The data directory has both metadata files, but metadata1 is greater. */ +TEST(set_term, metadataOneIsGreater, setUpDeps, tearDown, 0, NULL) +{ + struct fixture *f = data; + WRITE_METADATA_FILE(1, /* Metadata file index */ + 1, /* Format */ + 3, /* Version */ + 3, /* Term */ + 0 /* Voted for */); + WRITE_METADATA_FILE(2, /* Metadata file index */ + 1, /* Format */ + 2, /* Version */ + 2, /* Term */ + 0 /* Voted for */); + INIT; + SET_TERM(4); + ASSERT_METADATA_FILE(1 /* n */, 3 /* version */, 3 /* term */, + 0 /* voted for */); + ASSERT_METADATA_FILE(2 /* n */, 4 /* version */, 4 /* term */, + 0 /* voted for */); + return MUNIT_OK; +} + +/* The data directory has both metadata files, but metadata2 is greater. */ +TEST(set_term, metadataTwoIsGreater, setUpDeps, tearDown, 0, NULL) +{ + struct fixture *f = data; + WRITE_METADATA_FILE(1, /* Metadata file index */ + 1, /* Format */ + 1, /* Version */ + 1, /* Term */ + 0 /* Voted for */); + WRITE_METADATA_FILE(2, /* Metadata file index */ + 1, /* Format */ + 2, /* Version */ + 2, /* Term */ + 0 /* Voted for */); + INIT; + SET_TERM(2); + ASSERT_METADATA_FILE(1 /* n */, 3 /* version */, 2 /* term */, + 0 /* voted for */); + ASSERT_METADATA_FILE(2 /* n */, 2 /* version */, 2 /* term */, + 0 /* voted for */); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_snapshot_put.c b/test/raft/integration/test_uv_snapshot_put.c new file mode 100644 index 000000000..5e33b3e2a --- /dev/null +++ b/test/raft/integration/test_uv_snapshot_put.c @@ -0,0 +1,315 @@ +#include + +#include "../lib/runner.h" +#include "../lib/tcp.h" +#include "../lib/uv.h" +#include "append_helpers.h" + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance.
+ * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; + bool closed; + int count; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct snapshot +{ + raft_term term; + raft_index index; + uint64_t data; + bool done; +}; + +static void snapshotPutCbAssertResult(struct raft_io_snapshot_put *req, + int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +static void snapshotGetCbAssertResult(struct raft_io_snapshot_get *req, + struct raft_snapshot *snapshot, + int status) +{ + struct snapshot *expect = req->data; + munit_assert_int(status, ==, 0); + munit_assert_ptr_not_null(snapshot); + munit_assert_int(snapshot->term, ==, expect->term); + munit_assert_int(snapshot->index, ==, expect->index); + expect->done = true; + raft_configuration_close(&snapshot->configuration); + raft_free(snapshot->bufs[0].base); + raft_free(snapshot->bufs); + raft_free(snapshot); +} + +/* Submit a request to truncate the log at N */ +#define TRUNCATE(N) \ + { \ + int _rv; \ + _rv = f->io.truncate(&f->io, N); \ + munit_assert_int(_rv, ==, 0); \ + } + +#define SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, STATUS) \ + struct raft_snapshot _snapshot; \ + struct raft_buffer _snapshot_buf; \ + uint64_t _snapshot_data; \ + struct raft_io_snapshot_put _req; \ + struct result _result = {STATUS, false, NULL}; \ + int _rv; \ + _snapshot.term = 1; \ + _snapshot.index = INDEX; \ + raft_configuration_init(&_snapshot.configuration); \ + _rv = raft_configuration_add(&_snapshot.configuration, 1, "1", \ + RAFT_STANDBY); \ + munit_assert_int(_rv, ==, 0); \ + _snapshot.bufs = &_snapshot_buf; \ + _snapshot.n_bufs = 1; \ + _snapshot_buf.base = &_snapshot_data; \ + _snapshot_buf.len = sizeof _snapshot_data; \ + _req.data = &_result; \ + _rv = f->io.snapshot_put(&f->io, TRAILING, &_req, &_snapshot, \ + snapshotPutCbAssertResult); \ + munit_assert_int(_rv, ==, RV) + +/* Submit a snapshot put request for the given snapshot and wait for the + * operation to successfully complete. */ +#define SNAPSHOT_PUT(TRAILING, INDEX) \ + do { \ + SNAPSHOT_PUT_REQ(TRAILING, INDEX, 0 /* rv */, 0 /* status */); \ + LOOP_RUN_UNTIL(&_result.done); \ + raft_configuration_close(&_snapshot.configuration); \ + } while (0) + +/* Submit a snapshot put request and assert that it fails synchronously with the + * given error code and message. */ +#define SNAPSHOT_PUT_ERROR(TRAILING, INDEX, RV, ERRMSG) \ + do { \ + SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, 0 /* status */); \ + /* munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ + } while (0) + +/* Submit a snapshot put request and wait for the operation to fail with the + * given code and message. */ +#define SNAPSHOT_PUT_FAILURE(TRAILING, INDEX, STATUS, ERRMSG) \ + do { \ + SNAPSHOT_PUT_REQ(TRAILING, INDEX, 0 /* rv */, STATUS); \ + LOOP_RUN_UNTIL(&_result.done); \ + /*munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ + } while (0) + +/* Use raft_io->snapshot_get to load the last snapshot and compare it with the + * given parameters.
*/ +#define ASSERT_SNAPSHOT(TERM, INDEX, DATA) \ + do { \ + struct raft_io_snapshot_get _req; \ + struct snapshot _expect = {TERM, INDEX, DATA, false}; \ + int _rv; \ + _req.data = &_expect; \ + _rv = f->io.snapshot_get(&f->io, &_req, snapshotGetCbAssertResult); \ + munit_assert_int(_rv, ==, 0); \ + LOOP_RUN_UNTIL(&_expect.done); \ + } while (0) + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + f->io.data = f; + f->closed = false; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV_DEPS; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + SETUP_UV; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV; + tearDownDeps(f); +} + +/****************************************************************************** + * + * raft_io->snapshot_put + * + *****************************************************************************/ + +SUITE(snapshot_put) + +/* Put the first snapshot. */ +TEST(snapshot_put, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(10, /* trailing */ + 1 /* index */ + ); + ASSERT_SNAPSHOT(1, 1, 1); + return MUNIT_OK; +} + +/* If the number of closed entries is less than the given trailing amount, no + * segment is deleted. */ +TEST(snapshot_put, entriesLessThanTrailing, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + unsigned i; + raft_uv_set_segment_size( + &f->io, 4096); /* Lower the number of blocks to force finalizing */ + + for (i = 0; i < 40; i++) { + APPEND(10, 8); + } + + SNAPSHOT_PUT(128, /* trailing */ + 100 /* index */ + ); + + munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000150")); + munit_assert_true(DirHasFile(f->dir, "0000000000000151-0000000000000300")); + + return MUNIT_OK; +} + +/* If the number of closed entries is greater than the given trailing amount, + * closed segments that are fully past the trailing amount get deleted. */ +TEST(snapshot_put, entriesMoreThanTrailing, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + unsigned i; + raft_uv_set_segment_size( + &f->io, 4096); /* Lower the number of blocks to force finalizing */ + + for (i = 0; i < 40; i++) { + APPEND(10, 8); + } + + SNAPSHOT_PUT(128, /* trailing */ + 280 /* index */ + ); + + munit_assert_false(DirHasFile(f->dir, "0000000000000001-0000000000000150")); + munit_assert_true(DirHasFile(f->dir, "0000000000000151-0000000000000300")); + + return MUNIT_OK; +} + +/* Request to install a snapshot. */ +TEST(snapshot_put, install, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(4, 8); + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + return MUNIT_OK; +} + +/* Request to install a snapshot without compression. */ +TEST(snapshot_put, installNoCompression, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + raft_uv_set_snapshot_compression(&f->io, false); + APPEND(4, 8); + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + return MUNIT_OK; +} + +/* Request to install a snapshot, no previous entry is present.
*/ +TEST(snapshot_put, installWithoutPreviousEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + return MUNIT_OK; +} + +/* Request to install a couple of snapshots in a row, no previous entry is + * present. */ +TEST(snapshot_put, + installMultipleWithoutPreviousEntries, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + SNAPSHOT_PUT(0, /* trailing */ + 3 /* index */ + ); + SNAPSHOT_PUT(0, /* trailing */ + 1337 /* index */ + ); + return MUNIT_OK; +} + +/* Request to install a couple of snapshots in a row, AppendEntries Requests + * happen before, meanwhile and after */ +TEST(snapshot_put, + installMultipleAppendEntriesInBetween, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + + APPEND_SUBMIT(0, 256, 8); + APPEND_SUBMIT(1, 256, 8); + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + APPEND_WAIT(0); + APPEND_WAIT(1); + APPEND_SUBMIT(2, 256, 8); + APPEND_SUBMIT(3, 256, 8); + SNAPSHOT_PUT(0, /* trailing */ + 100 /* index */ + ); + APPEND_WAIT(2); + APPEND_WAIT(3); + APPEND_SUBMIT(4, 256, 8); + APPEND_SUBMIT(5, 256, 8); + APPEND_WAIT(4); + APPEND_WAIT(5); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_tcp_connect.c b/test/raft/integration/test_uv_tcp_connect.c new file mode 100644 index 000000000..7efc68c60 --- /dev/null +++ b/test/raft/integration/test_uv_tcp_connect.c @@ -0,0 +1,358 @@ +#include "../../../src/raft.h" +#include "../lib/addrinfo.h" +#include "../lib/heap.h" +#include "../lib/loop.h" +#include "../lib/runner.h" +#include "../lib/tcp.h" + +/****************************************************************************** + * + * Fixture with a TCP-based raft_uv_transport. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + FIXTURE_LOOP; + FIXTURE_TCP_SERVER; + struct raft_uv_transport transport; + bool closed; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; +}; + +static void closeCb(struct raft_uv_transport *transport) +{ + struct fixture *f = transport->data; + f->closed = true; +} + +static void connectCbAssertResult(struct raft_uv_connect *req, + struct uv_stream_s *stream, + int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + if (status == 0) { + uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free); + } + result->done = true; +} + +#define INIT \ + do { \ + int _rv; \ + _rv = f->transport.init(&f->transport, 1, "127.0.0.1:9000"); \ + munit_assert_int(_rv, ==, 0); \ + f->transport.data = f; \ + f->closed = false; \ + } while (0) + +#define CLOSE_SUBMIT \ + munit_assert_false(f->closed); \ + f->transport.close(&f->transport, closeCb); + +#define CLOSE_WAIT LOOP_RUN_UNTIL(&f->closed) +#define CLOSE \ + CLOSE_SUBMIT; \ + CLOSE_WAIT + +#define CONNECT_REQ(ID, ADDRESS, RV, STATUS) \ + struct raft_uv_connect _req; \ + struct result _result = {STATUS, false}; \ + int _rv; \ + _req.data = &_result; \ + _rv = f->transport.connect(&f->transport, &_req, ID, ADDRESS, \ + connectCbAssertResult); \ + munit_assert_int(_rv, ==, RV) + +/* Try to submit a connect request and assert that the given error code and + * message are returned.
*/ +#define CONNECT_ERROR(ID, ADDRESS, RV, ERRMSG) \ + { \ + CONNECT_REQ(ID, ADDRESS, RV /* rv */, 0 /* status */); \ + munit_assert_string_equal(f->transport.errmsg, ERRMSG); \ + } + +/* Submit a connect request with the given parameters and wait for the operation + * to successfully complete. */ +#define CONNECT(ID, ADDRESS) \ + { \ + CONNECT_REQ(ID, ADDRESS, 0 /* rv */, 0 /* status */); \ + LOOP_RUN_UNTIL(&_result.done); \ + } + +/* Submit a connect request with the given parameters and wait for the operation + * to fail with the given code and message. */ +#define CONNECT_FAILURE(ID, ADDRESS, STATUS, ERRMSG) \ + { \ + CONNECT_REQ(ID, ADDRESS, 0 /* rv */, STATUS); \ + LOOP_RUN_UNTIL(&_result.done); \ + munit_assert_string_equal(f->transport.errmsg, ERRMSG); \ + } + +/* Submit a connect request with the given parameters, close the transport after + * N loop iterations and assert that the request got canceled. */ +#define CONNECT_CLOSE(ID, ADDRESS, N) \ + { \ + CONNECT_REQ(ID, ADDRESS, 0 /* rv */, RAFT_CANCELED); \ + LOOP_RUN(N); \ + CLOSE_SUBMIT; \ + munit_assert_false(_result.done); \ + LOOP_RUN_UNTIL(&_result.done); \ + CLOSE_WAIT; \ + } + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], + MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + int rv; + SET_UP_ADDRINFO; + SET_UP_HEAP; + SETUP_LOOP; + SETUP_TCP_SERVER; + f->transport.version = 1; + rv = raft_uv_tcp_init(&f->transport, &f->loop); + munit_assert_int(rv, ==, 0); + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + LOOP_STOP; + raft_uv_tcp_close(&f->transport); + TEAR_DOWN_TCP_SERVER; + TEAR_DOWN_LOOP; + TEAR_DOWN_HEAP; + TEAR_DOWN_ADDRINFO; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + INIT; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + CLOSE; + tearDownDeps(f); +} + +/****************************************************************************** + * + * raft_uv_transport->connect() + * + *****************************************************************************/ + +#define BOGUS_ADDRESS "127.0.0.1:6666" +#define INVALID_ADDRESS "500.0.0.1:6666" + +SUITE(tcp_connect) + +/* Successfully connect to the peer by IP */ +TEST(tcp_connect, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CONNECT(2, TCP_SERVER_ADDRESS); + return MUNIT_OK; +} + +/* Successfully connect to the peer by hostname */ +TEST(tcp_connect, connectByName, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + char host_address[256]; + sprintf(host_address, "localhost:%d", TCP_SERVER_PORT); + CONNECT(2, host_address); + return MUNIT_OK; +} + +/* Successfully connect to the peer by first IP */ +TEST(tcp_connect, firstIP, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + const struct AddrinfoResult results[] = {{"127.0.0.1", TCP_SERVER_PORT}, + {"192.0.2.0", 6666}}; + AddrinfoInjectSetResponse(0, 2, results); + CONNECT(2, "any-host"); + return MUNIT_OK; +} + +/* Successfully connect to the peer by second IP */ +TEST(tcp_connect, secondIP, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + const struct AddrinfoResult results[] = {{"127.0.0.1", 6666}, + {"127.0.0.1", TCP_SERVER_PORT}}; + + AddrinfoInjectSetResponse(0, 2, results); +
CONNECT(2, "any-host"); + return MUNIT_OK; +} + +/* The peer has shutdown */ +TEST(tcp_connect, refused, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TCP_SERVER_STOP; + CONNECT_FAILURE(2, BOGUS_ADDRESS, RAFT_NOCONNECTION, + "uv_tcp_connect(): connection refused"); + return MUNIT_OK; +} + +static char *oomHeapFaultDelay[] = {"0", "1", "2", NULL}; +static char *oomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum oomParams[] = { + {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory conditions. */ +TEST(tcp_connect, oom, setUp, tearDown, 0, oomParams) +{ + struct fixture *f = data; + HEAP_FAULT_ENABLE; + CONNECT_ERROR(2, BOGUS_ADDRESS, RAFT_NOMEM, "out of memory"); + return MUNIT_OK; +} + +/* The transport is closed immediately after a connect request as been + * submitted. The request's callback is invoked with RAFT_CANCELED. */ +TEST(tcp_connect, closeImmediately, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 0); + return MUNIT_OK; +} + +/* The transport gets closed during the dns lookup */ +TEST(tcp_connect, closeDuringDnsLookup, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + + CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 1); + return MUNIT_OK; +} + +/* The transport gets closed during the handshake. */ +TEST(tcp_connect, closeDuringHandshake, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + + /* This test fails for libuv version >= 1.44.2 due to changes in uv_run + * whereby queueing and processing the write_cb happen in the same loop + * iteration, not leaving us a chance to close without going through a lot + * of hoops. + * https://github.com/libuv/libuv/pull/3598 */ + unsigned incompatible_uv = (1 << 16) | (44 << 8) | 2; + if (uv_version() >= incompatible_uv) { + CLOSE; + return MUNIT_SKIP; + } + + CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 2); + return MUNIT_OK; +} + +static void checkCb(struct uv_check_s *check) +{ + struct fixture *f = check->data; + CLOSE_SUBMIT; + uv_close((struct uv_handle_s *)check, NULL); +} + +/* The transport gets closed right after a dns lookup failure, while the + * connection attempt is being aborted. */ +TEST(tcp_connect, closeDuringDnsLookupAbort, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + struct uv_check_s check; + int rv; + /* Use a check handle in order to close the transport in the same loop + * iteration where the dns failure lookup occurs */ + rv = uv_check_init(&f->loop, &check); + munit_assert_int(rv, ==, 0); + check.data = f; + uv_check_start(&check, checkCb); + CONNECT_REQ(2, INVALID_ADDRESS, 0, RAFT_NOCONNECTION); + LOOP_RUN(1); + LOOP_RUN_UNTIL(&_result.done); + CLOSE_WAIT; + return MUNIT_OK; +} + +/* The transport gets closed right after a connection failure, while the + * connection attempt is being aborted. */ +TEST(tcp_connect, closeDuringConnectAbort, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + struct uv_check_s check; + int rv; + + /* Use a check handle in order to close the transport in the same loop + * iteration where the connection failure occurs. 
+/* The transport gets closed right after a connection failure, while the
+ * connection attempt is being aborted. */
+TEST(tcp_connect, closeDuringConnectAbort, setUp, tearDownDeps, 0, NULL)
+{
+    struct fixture *f = data;
+    struct uv_check_s check;
+    int rv;
+
+    /* Use a check handle in order to close the transport in the same loop
+     * iteration where the connection failure occurs. */
+    rv = uv_check_init(&f->loop, &check);
+    munit_assert_int(rv, ==, 0);
+    check.data = f;
+    CONNECT_REQ(2, BOGUS_ADDRESS, 0, RAFT_NOCONNECTION);
+    /* A successful DNS lookup will initiate the async connect */
+    LOOP_RUN(1);
+    uv_check_start(&check, checkCb);
+    LOOP_RUN(1);
+    LOOP_RUN_UNTIL(&_result.done);
+    CLOSE_WAIT;
+    return MUNIT_OK;
+}
+
+/* The transport gets closed right after the first connection attempt failed,
+ * while doing a second connection attempt. */
+TEST(tcp_connect, closeDuringSecondConnect, setUp, tearDownDeps, 0, NULL)
+{
+    struct fixture *f = data;
+    struct uv_check_s check;
+    int rv;
+    const struct AddrinfoResult results[] = {{"127.0.0.1", 6666},
+                                             {"127.0.0.1", TCP_SERVER_PORT}};
+
+    AddrinfoInjectSetResponse(0, 2, results);
+
+    /* Use a check handle in order to close the transport in the same loop
+     * iteration where the second connection attempt occurs. */
+    rv = uv_check_init(&f->loop, &check);
+    munit_assert_int(rv, ==, 0);
+    check.data = f;
+    CONNECT_REQ(2, "any-host", 0, RAFT_CANCELED);
+    /* A successful DNS lookup will initiate the async connect */
+    LOOP_RUN(1);
+    uv_check_start(&check, checkCb);
+    LOOP_RUN(1);
+    LOOP_RUN_UNTIL(&_result.done);
+    CLOSE_WAIT;
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_tcp_listen.c b/test/raft/integration/test_uv_tcp_listen.c
new file mode 100644
index 000000000..b239cfa87
--- /dev/null
+++ b/test/raft/integration/test_uv_tcp_listen.c
@@ -0,0 +1,416 @@
+#include "../../../src/raft.h"
+#include "../../../src/raft/byte.h"
+#include "../lib/addrinfo.h"
+#include "../lib/heap.h"
+#include "../lib/loop.h"
+#include "../lib/runner.h"
+#include "../lib/tcp.h"
+
+/******************************************************************************
+ *
+ * Fixture with a TCP-based raft_uv_transport.
+ * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + FIXTURE_LOOP; + FIXTURE_TCP; + struct raft_uv_transport transport; + bool accepted; + bool closed; + struct + { + uint8_t buf[sizeof(uint64_t) + /* Protocol version */ + sizeof(uint64_t) + /* Server ID */ + sizeof(uint64_t) + /* Length of address */ + sizeof(uint64_t) * 2 /* Address */]; + size_t offset; + } handshake; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +#define PEER_ID 2 +#define PEER_ADDRESS "127.0.0.1:666" + +static void closeCb(struct raft_uv_transport *transport) +{ + struct fixture *f = transport->data; + f->closed = true; +} + +static void acceptCb(struct raft_uv_transport *t, + raft_id id, + const char *address, + struct uv_stream_s *stream) +{ + struct fixture *f = t->data; + munit_assert_int(id, ==, PEER_ID); + munit_assert_string_equal(address, PEER_ADDRESS); + f->accepted = true; + uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free); +} + +#define INIT \ + do { \ + int _rv; \ + f->transport.version = 1; \ + _rv = raft_uv_tcp_init(&f->transport, &f->loop); \ + munit_assert_int(_rv, ==, 0); \ + const char *bind_addr = munit_parameters_get(params, "bind-address"); \ + if (bind_addr && strlen(bind_addr)) { \ + _rv = raft_uv_tcp_set_bind_address(&f->transport, bind_addr); \ + munit_assert_int(_rv, ==, 0); \ + } \ + const char *address = munit_parameters_get(params, "address"); \ + if (!address) { \ + address = "127.0.0.1:9000"; \ + } \ + _rv = f->transport.init(&f->transport, 1, address); \ + munit_assert_int(_rv, ==, 0); \ + f->transport.data = f; \ + f->closed = false; \ + } while (0) + +#define CLOSE \ + do { \ + f->transport.close(&f->transport, closeCb); \ + LOOP_RUN_UNTIL(&f->closed); \ + raft_uv_tcp_close(&f->transport); \ + } while (0) + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], + MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SET_UP_ADDRINFO; + SET_UP_HEAP; + SETUP_LOOP; + SETUP_TCP; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_TCP; + TEAR_DOWN_LOOP; + TEAR_DOWN_HEAP; + TEAR_DOWN_ADDRINFO; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + void *cursor; + /* test_tcp_listen(&f->tcp); */ + INIT; + f->accepted = false; + f->handshake.offset = 0; + + cursor = f->handshake.buf; + bytePut64(&cursor, 1); + bytePut64(&cursor, PEER_ID); + bytePut64(&cursor, 16); + strcpy(cursor, PEER_ADDRESS); + + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + CLOSE; + tearDownDeps(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +#define LISTEN(EXPECTED_RV) \ + do { \ + int rv; \ + rv = f->transport.listen(&f->transport, acceptCb); \ + munit_assert_int(rv, ==, EXPECTED_RV); \ + } while (false) + +/* Connect to the listening socket of the transport, creating a new connection + * that is waiting to be accepted. 
*/
+#define PEER_CONNECT TCP_CLIENT_CONNECT(9000)
+
+/* Make the peer close the connection. */
+#define PEER_CLOSE TCP_CLIENT_CLOSE
+
+/* Make the connected client send handshake data. */
+#define PEER_HANDSHAKE                        \
+    do {                                      \
+        size_t n = sizeof f->handshake.buf;   \
+        TCP_CLIENT_SEND(f->handshake.buf, n); \
+    } while (0)
+
+/* Make the connected client send partial handshake data: only N bytes will be
+ * sent, starting from the offset of the last call. */
+#define PEER_HANDSHAKE_PARTIAL(N)                                   \
+    do {                                                            \
+        TCP_CLIENT_SEND(f->handshake.buf + f->handshake.offset, N); \
+    } while (0)
+
+/* After a PEER_CONNECT() call, spin the event loop until the connected
+ * callback of the listening TCP handle gets called. */
+#define LOOP_RUN_UNTIL_CONNECTED LOOP_RUN(1);
+
+/* After a PEER_HANDSHAKE_PARTIAL() call, spin the event loop until the read
+ * callback gets called. */
+#define LOOP_RUN_UNTIL_READ LOOP_RUN(1);
+
+/* Spin the event loop until the accept callback gets eventually invoked. */
+#define ACCEPT LOOP_RUN_UNTIL(&f->accepted);
+
+/******************************************************************************
+ *
+ * Success scenarios.
+ *
+ *****************************************************************************/
+
+SUITE(tcp_listen)
+
+/* Parameters for listen address */
+
+static char *validAddresses[] = {"127.0.0.1:9000", "localhost:9000", NULL};
+
+static char *validBindAddresses[] = {
+    "", "127.0.0.1:9000", "localhost:9000", ":9000", "0.0.0.0:9000", NULL};
+
+static MunitParameterEnum validListenParams[] = {
+    {"address", validAddresses},
+    {"bind-address", validBindAddresses},
+    {NULL, NULL},
+};
+
+/* If the handshake is successful, the accept callback is invoked. */
+TEST(tcp_listen, success, setUp, tearDown, 0, validListenParams)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    ACCEPT;
+    return MUNIT_OK;
+}
+
+/* Parameters for invalid listen addresses */
+static char *invalidAddresses[] = {"500.1.2.3:9000", "not-existing:9000",
+                                   "192.0.2.0:9000", NULL};
+
+static char *invalidBindAddresses[] = {
+    "", "500.1.2.3:9000", "not-existing:9000", "192.0.2.0:9000", NULL};
+
+static MunitParameterEnum invalidTcpListenParams[] = {
+    {"address", invalidAddresses},
+    {"bind-address", invalidBindAddresses},
+    {NULL, NULL},
+};
+
+/* Check that an error is returned when an invalid hostname is specified. */
+TEST(tcp_listen, invalidAddress, setUp, tearDown, 0, invalidTcpListenParams)
+{
+    struct fixture *f = data;
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* Check success when addrinfo resolves to multiple IPs and the first one is
+ * used to connect. */
+TEST(tcp_listen, firstOfTwo, setUp, tearDown, 0, NULL)
+{
+    const struct AddrinfoResult results[] = {{"127.0.0.1", 9000},
+                                             {"127.0.0.2", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 2, results);
+    LISTEN(0);
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    ACCEPT;
+    return MUNIT_OK;
+}
+
+/* Check success when addrinfo resolves to multiple IPs and the second one is
+ * used to connect. */
+TEST(tcp_listen, secondOfTwo, setUp, tearDown, 0, NULL)
+{
+    const struct AddrinfoResult results[] = {{"127.0.0.2", 9000},
+                                             {"127.0.0.1", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 2, results);
+
+    LISTEN(0);
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    ACCEPT;
+    return MUNIT_OK;
+}
+
+/* Simulate a port-already-in-use error by making the addrinfo response contain
+ * the same IP twice. */
+TEST(tcp_listen, alreadyBound, setUp, tearDown, 0, NULL)
+{
+    /* We need to use the same endpoint three times as a simple duplicate will
+     * be skipped due to a strange glibc behavior, see
+     * https://bugzilla.redhat.com/show_bug.cgi?id=496300 */
+    const struct AddrinfoResult results[] = {
+        {"127.0.0.1", 9000}, {"127.0.0.1", 9000}, {"127.0.0.1", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 3, results);
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* Error binding the first IP address. */
+TEST(tcp_listen, cannotBindFirst, setUp, tearDown, 0, NULL)
+{
+    const struct AddrinfoResult results[] = {{"192.0.2.0", 9000},
+                                             {"127.0.0.1", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 2, results);
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* Error binding the second IP address. */
+TEST(tcp_listen, cannotBindSecond, setUp, tearDown, 0, NULL)
+{
+    const struct AddrinfoResult results[] = {{"127.0.0.1", 9000},
+                                             {"192.0.2.0", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 2, results);
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* Check the error returned on a general DNS server failure. */
+TEST(tcp_listen, resolveFailure, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(EAI_FAIL, 0, NULL);
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* The client sends us a bad protocol version */
+TEST(tcp_listen, badProtocol, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    memset(f->handshake.buf, 999, sizeof(uint64_t));
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    LOOP_RUN_UNTIL_CONNECTED;
+    LOOP_RUN_UNTIL_READ;
+    return MUNIT_OK;
+}
+
+/* Parameters for sending a partial handshake */
+static char *partialHandshakeN[] = {"8", "16", "24", "32", NULL};
+
+static MunitParameterEnum peerAbortParams[] = {
+    {"n", partialHandshakeN},
+    {NULL, NULL},
+};
+
+/* The peer closes the connection after having sent a partial handshake. */
+TEST(tcp_listen, peerAbort, setUp, tearDown, 0, peerAbortParams)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    const char *n = munit_parameters_get(params, "n");
+    PEER_CONNECT;
+    PEER_HANDSHAKE_PARTIAL(atoi(n));
+    LOOP_RUN_UNTIL_CONNECTED;
+    LOOP_RUN_UNTIL_READ;
+    PEER_CLOSE;
+    return MUNIT_OK;
+}
+
+/* TODO: skip "2" because it makes libuv crash, as it calls abort(). See also
+ * https://github.com/libuv/libuv/issues/1948 */
+static char *oomHeapFaultDelay[] = {"0", "1", "3", NULL};
+static char *oomHeapFaultRepeat[] = {"1", NULL};
+
+static MunitParameterEnum oomParams[] = {
+    {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay},
+    {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat},
+    {NULL, NULL},
+};
+
+/* Out of memory conditions */
+TEST(tcp_listen, oom, setUp, tearDown, 0, oomParams)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    HEAP_FAULT_ENABLE;
+
+    /* Run as much as possible. */
+    uv_run(&f->loop, UV_RUN_NOWAIT);
+    uv_run(&f->loop, UV_RUN_NOWAIT);
+    uv_run(&f->loop, UV_RUN_NOWAIT);
+
+    return MUNIT_OK;
+}
+
+/* Close the transport right after an incoming connection becomes pending, but
+ * it hasn't been accepted yet. */
+TEST(tcp_listen, pending, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    PEER_CONNECT;
+    return MUNIT_OK;
+}
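For reference, the handshake these tests feed to the listener is the fixed preamble that setUp() prepares with bytePut64(): a 64-bit protocol version, the sender's 64-bit server ID, the 64-bit length of the address block, and the NUL-padded address itself. A hypothetical client-side encoder, assuming the little-endian word representation produced by src/raft/byte.h and a little-endian host (so plain memcpy of a uint64_t matches the wire format):

#include <stdint.h>
#include <string.h>

/* Illustrative sketch only: buf must have room for 24 + 16 bytes, and the
 * address must fit in 15 characters plus the terminating NUL. */
static size_t encodeHandshake(uint64_t server_id,
                              const char *address,
                              uint8_t *buf)
{
    uint64_t version = 1;
    uint64_t addr_len = 16; /* address block, padded to a multiple of 8 */
    memcpy(buf + 0, &version, 8);   /* protocol version */
    memcpy(buf + 8, &server_id, 8); /* server ID */
    memcpy(buf + 16, &addr_len, 8); /* length of the address block */
    memset(buf + 24, 0, addr_len);  /* zero padding */
    strcpy((char *)(buf + 24), address);
    return 24 + addr_len;
}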
+
+/* Close the transport right after an incoming connection gets accepted, and
+ * the peer hasn't sent handshake data yet. */
+TEST(tcp_listen, closeBeforeHandshake, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    PEER_CONNECT;
+    LOOP_RUN_UNTIL_CONNECTED;
+    return MUNIT_OK;
+}
+
+static MunitParameterEnum closeDuringHandshake[] = {
+    {"n", partialHandshakeN},
+    {NULL, NULL},
+};
+
+/* Close the transport right after the peer has started to send handshake data,
+ * but isn't done with it yet. */
+TEST(tcp_listen, handshake, setUp, tearDown, 0, closeDuringHandshake)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    const char *n_param = munit_parameters_get(params, "n");
+    PEER_CONNECT;
+    PEER_HANDSHAKE_PARTIAL(atoi(n_param));
+    LOOP_RUN_UNTIL_CONNECTED;
+    LOOP_RUN_UNTIL_READ;
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_truncate.c b/test/raft/integration/test_uv_truncate.c
new file mode 100644
index 000000000..b702d2669
--- /dev/null
+++ b/test/raft/integration/test_uv_truncate.c
@@ -0,0 +1,296 @@
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+    int count; /* To generate deterministic entry data */
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+struct result
+{
+    int status;
+    bool done;
+};
+
+static void appendCbAssertResult(struct raft_io_append *req, int status)
+{
+    struct result *result = req->data;
+    munit_assert_int(status, ==, result->status);
+    result->done = true;
+}
+
+/* Declare and fill the entries array for the append request identified by
+ * I. The array will have N entries, and each entry will have a data buffer of
+ * SIZE bytes. */
+#define ENTRIES(I, N, SIZE)                                 \
+    struct raft_entry _entries##I[N];                       \
+    uint8_t _entries_data##I[N * SIZE];                     \
+    do {                                                    \
+        int _i;                                             \
+        for (_i = 0; _i < N; _i++) {                        \
+            struct raft_entry *entry = &_entries##I[_i];    \
+            entry->term = 1;                                \
+            entry->type = RAFT_COMMAND;                     \
+            entry->buf.base = &_entries_data##I[_i * SIZE]; \
+            entry->buf.len = SIZE;                          \
+            entry->batch = NULL;                            \
+            munit_assert_ptr_not_null(entry->buf.base);     \
+            memset(entry->buf.base, 0, entry->buf.len);     \
+            f->count++;                                     \
+            *(uint64_t *)entry->buf.base = f->count;        \
+        }                                                   \
+    } while (0)
+
+/* Submit an append request identified by I, with N_ENTRIES entries, each one
+ * of size ENTRY_SIZE. */
+#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE)                     \
+    struct raft_io_append _req##I;                                  \
+    struct result _result##I = {0, false};                          \
+    int _rv##I;                                                     \
+    ENTRIES(I, N_ENTRIES, ENTRY_SIZE);                              \
+    _req##I.data = &_result##I;                                     \
+    _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, \
+                          appendCbAssertResult);                    \
+    munit_assert_int(_rv##I, ==, 0)
+
+/* Wait for the append request identified by I to complete. */
+#define APPEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done)
+
+#define APPEND_EXPECT(I, STATUS) _result##I.status = STATUS
+
+/* Submit an append request and wait for it to successfully complete. */
+#define APPEND(N)                  \
+    do {                           \
+        APPEND_SUBMIT(9999, N, 8); \
+        APPEND_WAIT(9999);         \
+    } while (0)
+
+#define TRUNCATE(N)                      \
+    do {                                 \
+        int rv_;                         \
+        rv_ = f->io.truncate(&f->io, N); \
+        munit_assert_int(rv_, ==, 0);    \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_UV; + f->count = 0; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV_DEPS; + free(f); +} + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Shutdown the fixture's raft_io instance, then load all entries on disk using + * a new raft_io instance, and assert that there are N entries with data + * matching the DATA array. */ +#define ASSERT_ENTRIES(N, ...) \ + TEAR_DOWN_UV; \ + do { \ + struct uv_loop_s _loop; \ + struct raft_uv_transport _transport; \ + struct raft_io _io; \ + raft_term _term; \ + raft_id _voted_for; \ + struct raft_snapshot *_snapshot; \ + raft_index _start_index; \ + struct raft_entry *_entries; \ + size_t _i; \ + size_t _n; \ + void *_batch = NULL; \ + unsigned _data[N] = {__VA_ARGS__}; \ + int _rv; \ + \ + _rv = uv_loop_init(&_loop); \ + munit_assert_int(_rv, ==, 0); \ + _transport.version = 1; \ + _rv = raft_uv_tcp_init(&_transport, &_loop); \ + munit_assert_int(_rv, ==, 0); \ + _rv = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ + munit_assert_int(_rv, ==, 0); \ + _rv = _io.init(&_io, 1, "1"); \ + munit_assert_int(_rv, ==, 0); \ + _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \ + &_entries, &_n); \ + munit_assert_int(_rv, ==, 0); \ + _io.close(&_io, NULL); \ + uv_run(&_loop, UV_RUN_NOWAIT); \ + raft_uv_close(&_io); \ + raft_uv_tcp_close(&_transport); \ + uv_loop_close(&_loop); \ + \ + munit_assert_ptr_null(_snapshot); \ + munit_assert_int(_n, ==, N); \ + for (_i = 0; _i < _n; _i++) { \ + struct raft_entry *_entry = &_entries[_i]; \ + uint64_t _value = *(uint64_t *)_entry->buf.base; \ + munit_assert_int(_entry->term, ==, 1); \ + munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ + munit_assert_int(_value, ==, _data[_i]); \ + munit_assert_ptr_not_null(_entry->batch); \ + } \ + for (_i = 0; _i < _n; _i++) { \ + struct raft_entry *_entry = &_entries[_i]; \ + if (_entry->batch != _batch) { \ + _batch = _entry->batch; \ + raft_free(_batch); \ + } \ + } \ + raft_free(_entries); \ + } while (0); + +/****************************************************************************** + * + * raft_io->truncate() + * + *****************************************************************************/ + +SUITE(truncate) + +/* If the index to truncate is at the start of a segment, that segment and all + * subsequent ones are removed. */ +TEST(truncate, wholeSegment, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(3); + TRUNCATE(1); + APPEND(1); + ASSERT_ENTRIES(1 /* n entries */, 4 /* entries data */); + return MUNIT_OK; +} + +/* The index to truncate is the same as the last appended entry. */ +TEST(truncate, sameAsLastIndex, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(3); + TRUNCATE(3); + APPEND(1); + ASSERT_ENTRIES(3 /* n entries */, 1, 2, 4 /* entries data */); + return MUNIT_OK; +} + +/* If the index to truncate is not at the start of a segment, that segment gets + * truncated. 
*/ +TEST(truncate, partialSegment, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(3); + APPEND(1); + TRUNCATE(2); + APPEND(1); + ASSERT_ENTRIES(2, /* n entries */ + 1, 5 /* entries data */ + ); + return MUNIT_OK; +} + +/* The truncate request is issued while an append request is still pending. */ +TEST(truncate, pendingAppend, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, /* request ID */ + 3, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(2 /* truncation index */); + APPEND(1); + ASSERT_ENTRIES(2, /* n entries */ + 1, 4 /* entries data */ + ); + return MUNIT_OK; +} + +/* Multiple truncate requests pending at the same time. */ +TEST(truncate, multiplePending, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, /* request ID */ + 3, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(2 /* truncation index */); + APPEND_SUBMIT(1, /* request ID */ + 2, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(3 /* truncation index */); + APPEND(1); + ASSERT_ENTRIES(3, /* n entries */ + 1, 4, 6 /* entries data */ + ); + return MUNIT_OK; +} + +/* The truncate request gets canceled because we're closing. */ +TEST(truncate, closing, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, /* request ID */ + 3, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(2 /* truncation index */); + APPEND_EXPECT(0, /* request ID */ + RAFT_CANCELED /* status */ + ); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* Multiple truncate requests get canceled because we're closing. */ +TEST(truncate, closingMultiple, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, /* request ID */ + 3, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(2 /* truncation index */); + APPEND_SUBMIT(1, /* request ID */ + 2, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(3 /* truncation index */); + APPEND_EXPECT(0, /* request ID */ + RAFT_CANCELED /* status */ + ); + APPEND_EXPECT(1, /* request ID */ + RAFT_CANCELED /* status */ + ); + TEAR_DOWN_UV; + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_truncate_snapshot.c b/test/raft/integration/test_uv_truncate_snapshot.c new file mode 100644 index 000000000..adbe88398 --- /dev/null +++ b/test/raft/integration/test_uv_truncate_snapshot.c @@ -0,0 +1,244 @@ +#include "../lib/runner.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; + int count; /* To generate deterministic entry data */ +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Maximum number of blocks a segment can have */ +#define MAX_SEGMENT_BLOCKS 4 + +/* This block size should work fine for all file systems. 
*/
+#define SEGMENT_BLOCK_SIZE 4096
+
+/* Default segment size */
+#define SEGMENT_SIZE (4096 * MAX_SEGMENT_BLOCKS)
+
+struct result
+{
+    int status;
+    bool done;
+    void *data;
+};
+
+static void appendCbAssertResult(struct raft_io_append *req, int status)
+{
+    struct result *result = req->data;
+    munit_assert_int(status, ==, result->status);
+    result->done = true;
+}
+
+static void snapshotPutCbAssertResult(struct raft_io_snapshot_put *req,
+                                      int status)
+{
+    struct result *result = req->data;
+    munit_assert_int(status, ==, result->status);
+    result->done = true;
+}
+
+/* Declare and fill the entries array for the append request identified by
+ * I. The array will have N entries, and each entry will have a data buffer of
+ * SIZE bytes. */
+#define ENTRIES(I, N, SIZE)                                 \
+    struct raft_entry _entries##I[N];                       \
+    uint8_t _entries_data##I[N * SIZE];                     \
+    do {                                                    \
+        int _i;                                             \
+        for (_i = 0; _i < N; _i++) {                        \
+            struct raft_entry *entry = &_entries##I[_i];    \
+            entry->term = 1;                                \
+            entry->type = RAFT_COMMAND;                     \
+            entry->buf.base = &_entries_data##I[_i * SIZE]; \
+            entry->buf.len = SIZE;                          \
+            entry->batch = NULL;                            \
+            munit_assert_ptr_not_null(entry->buf.base);     \
+            memset(entry->buf.base, 0, entry->buf.len);     \
+            f->count++;                                     \
+            *(uint64_t *)entry->buf.base = f->count;        \
+        }                                                   \
+    } while (0)
+
+/* Submit an append request identified by I, with N_ENTRIES entries, each one
+ * of size ENTRY_SIZE. */
+#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE)                     \
+    struct raft_io_append _req##I;                                  \
+    struct result _result##I = {0, false, NULL};                    \
+    int _rv##I;                                                     \
+    ENTRIES(I, N_ENTRIES, ENTRY_SIZE);                              \
+    _req##I.data = &_result##I;                                     \
+    _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, \
+                          appendCbAssertResult);                    \
+    munit_assert_int(_rv##I, ==, 0)
+
+#define TRUNCATE(N)                      \
+    do {                                 \
+        int rv_;                         \
+        rv_ = f->io.truncate(&f->io, N); \
+        munit_assert_int(rv_, ==, 0);    \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    SETUP_UV;
+    raft_uv_set_block_size(&f->io, SEGMENT_BLOCK_SIZE);
+    raft_uv_set_segment_size(&f->io, SEGMENT_SIZE);
+    f->count = 0;
+    return f;
+}
+
+static void tearDownDeps(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Assertions
+ *
+ *****************************************************************************/
+
+/* Shutdown the fixture's raft_io instance, then load all entries on disk using
+ * a new raft_io instance, and assert that there are N entries with data
+ * matching the DATA array. */
+#define ASSERT_ENTRIES(N, ...)
\ + TEAR_DOWN_UV; \ + do { \ + struct uv_loop_s _loop; \ + struct raft_uv_transport _transport; \ + struct raft_io _io; \ + raft_term _term; \ + raft_id _voted_for; \ + struct raft_snapshot *_snap; \ + raft_index _start_index; \ + struct raft_entry *_entries; \ + size_t _i; \ + size_t _n; \ + void *_batch = NULL; \ + unsigned _data[N] = {__VA_ARGS__}; \ + int _ret; \ + \ + _ret = uv_loop_init(&_loop); \ + munit_assert_int(_ret, ==, 0); \ + _transport.version = 1; \ + _ret = raft_uv_tcp_init(&_transport, &_loop); \ + munit_assert_int(_ret, ==, 0); \ + _ret = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ + munit_assert_int(_ret, ==, 0); \ + _ret = _io.init(&_io, 1, "1"); \ + munit_assert_int(_ret, ==, 0); \ + _ret = _io.load(&_io, &_term, &_voted_for, &_snap, &_start_index, \ + &_entries, &_n); \ + munit_assert_int(_ret, ==, 0); \ + _io.close(&_io, NULL); \ + uv_run(&_loop, UV_RUN_NOWAIT); \ + raft_uv_close(&_io); \ + raft_uv_tcp_close(&_transport); \ + uv_loop_close(&_loop); \ + \ + munit_assert_size(_n, ==, N); \ + for (_i = 0; _i < _n; _i++) { \ + struct raft_entry *_entry = &_entries[_i]; \ + uint64_t _value = *(uint64_t *)_entry->buf.base; \ + munit_assert_int(_entry->term, ==, 1); \ + munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ + munit_assert_int(_value, ==, _data[_i]); \ + munit_assert_ptr_not_null(_entry->batch); \ + } \ + for (_i = 0; _i < _n; _i++) { \ + struct raft_entry *_entry = &_entries[_i]; \ + if (_entry->batch != _batch) { \ + _batch = _entry->batch; \ + raft_free(_batch); \ + } \ + } \ + raft_free(_entries); \ + if (_snap != NULL) { \ + raft_configuration_close(&_snap->configuration); \ + munit_assert_int(_snap->n_bufs, ==, 1); \ + raft_free(_snap->bufs[0].base); \ + raft_free(_snap->bufs); \ + raft_free(_snap); \ + } \ + } while (0); + +#define SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, STATUS) \ + struct raft_snapshot _snapshot; \ + struct raft_buffer _snapshot_buf; \ + uint64_t _snapshot_data; \ + struct raft_io_snapshot_put _req; \ + struct result _result = {STATUS, false, NULL}; \ + int _rv; \ + _snapshot.term = 1; \ + _snapshot.index = INDEX; \ + raft_configuration_init(&_snapshot.configuration); \ + _rv = raft_configuration_add(&_snapshot.configuration, 1, "1", \ + RAFT_STANDBY); \ + munit_assert_int(_rv, ==, 0); \ + _snapshot.bufs = &_snapshot_buf; \ + _snapshot.n_bufs = 1; \ + _snapshot_buf.base = &_snapshot_data; \ + _snapshot_buf.len = sizeof _snapshot_data; \ + _req.data = &_result; \ + _rv = f->io.snapshot_put(&f->io, TRAILING, &_req, &_snapshot, \ + snapshotPutCbAssertResult); \ + munit_assert_int(_rv, ==, RV) + +#define SNAPSHOT_CLEANUP() raft_configuration_close(&_snapshot.configuration) + +/****************************************************************************** + * + * test interaction of raft_io->snapshot_put and raft_io->truncate() + * + *****************************************************************************/ + +SUITE(snapshot_truncate) + +/* Fill up 3 segments worth of data, then take a snapshot. + * While the snapshot is taken, start a truncate request. */ +TEST(snapshot_truncate, snapshotThenTruncate, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + + /* Take a snapshot, this will use a uv_barrier. */ + SNAPSHOT_PUT_REQ(8192, 6, 0, 0); + + /* Truncate, this will use a uv_barrier too. 
*/
+    TRUNCATE(8);
+
+    /* There's no truncate callback to wait for, so loop for a while. */
+    LOOP_RUN(1000);
+
+    /* Check that truncate has done its job. */
+    ASSERT_ENTRIES(7, 1, 2, 3, 4, 5, 6, 7);
+
+    SNAPSHOT_CLEANUP();
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_work.c b/test/raft/integration/test_uv_work.c
new file mode 100644
index 000000000..14bfc41da
--- /dev/null
+++ b/test/raft/integration/test_uv_work.c
@@ -0,0 +1,103 @@
+#include <unistd.h>
+
+#include "../../../src/raft/uv.h"
+#include "../lib/dir.h"
+#include "../lib/loop.h"
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+};
+
+struct result
+{
+    int rv;      /* Indicate success or failure of the work */
+    int counter; /* Proof that work was performed */
+    bool done;   /* To check test termination */
+};
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    SETUP_UV;
+    return f;
+}
+
+static void tearDownDeps(void *data)
+{
+    struct fixture *f = data;
+    if (f == NULL) {
+        return;
+    }
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    if (f == NULL) {
+        return;
+    }
+    TEAR_DOWN_UV;
+    tearDownDeps(f);
+}
+
+/******************************************************************************
+ *
+ * UvAsyncWork
+ *
+ *****************************************************************************/
+
+static void asyncWorkCbAssertResult(struct raft_io_async_work *req, int status)
+{
+    struct result *r = req->data;
+    munit_assert_int(status, ==, r->rv);
+    munit_assert_int(r->counter, ==, 1);
+    r->done = true;
+}
+
+static int asyncWorkFn(struct raft_io_async_work *req)
+{
+    struct result *r = req->data;
+    sleep(1);
+    r->counter = 1;
+    return r->rv;
+}
+
+SUITE(UvAsyncWork)
+
+static char *rvs[] = {"-1", "0", "1", "37", NULL};
+static MunitParameterEnum rvs_params[] = {
+    {"rv", rvs},
+    {NULL, NULL},
+};
+
+TEST(UvAsyncWork, work, setUp, tearDown, 0, rvs_params)
+{
+    struct fixture *f = data;
+    struct result res = {0};
+    struct raft_io_async_work req = {0};
+    res.rv = (int)strtol(munit_parameters_get(params, "rv"), NULL, 0);
+    req.data = &res;
+    req.work = asyncWorkFn;
+    UvAsyncWork(&f->io, &req, asyncWorkCbAssertResult);
+    LOOP_RUN_UNTIL(&res.done);
+    return MUNIT_OK;
+}
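A note on the contract exercised by test_uv_work.c above: the work function runs on a libuv threadpool thread (hence the sleep() to prove it does not block the loop), and its return value is delivered unchanged as the status argument of the completion callback, which runs back on the loop thread. A condensed, hypothetical illustration under the same fixture conventions (squareFn and squareDoneCb are made-up names):

static int squareFn(struct raft_io_async_work *req)
{
    int *value = req->data; /* runs off the loop thread; may block */
    *value = (*value) * (*value);
    return 0; /* becomes `status` in the completion callback */
}

static void squareDoneCb(struct raft_io_async_work *req, int status)
{
    int *value = req->data; /* runs back on the loop thread */
    munit_assert_int(status, ==, 0);
    munit_assert_int(*value, ==, 49);
}

/* Usage, inside a test with the same fixture as above:
 *
 *     int value = 7;
 *     struct raft_io_async_work req = {0};
 *     req.data = &value;
 *     req.work = squareFn;
 *     UvAsyncWork(&f->io, &req, squareDoneCb);
 */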
diff --git a/test/raft/integration/test_voter_contacts.c b/test/raft/integration/test_voter_contacts.c
new file mode 100644
index 000000000..ab6405db0
--- /dev/null
+++ b/test/raft/integration/test_voter_contacts.c
@@ -0,0 +1,105 @@
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+#define N_SERVERS 3
+
+/******************************************************************************
+ *
+ * Fixture with a test raft cluster.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_CLUSTER;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+#define STEP_N(N) raft_fixture_step_n(&f->cluster, N)
+
+/******************************************************************************
+ *
+ * Set up a cluster with three servers.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(N_SERVERS);
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * raft_voter_contacts
+ *
+ *****************************************************************************/
+
+SUITE(raft_voter_contacts)
+
+TEST(raft_voter_contacts, upToDate, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+
+    CLUSTER_STEP_UNTIL_HAS_LEADER(1000);
+    CLUSTER_STEP_N(1000);
+
+    /* N node cluster with leader */
+    for (unsigned int i = 0; i < N_SERVERS; i++) {
+        int count = raft_voter_contacts(CLUSTER_RAFT(i));
+        if (i == CLUSTER_LEADER) {
+            munit_assert_int(count, ==, N_SERVERS);
+        } else {
+            munit_assert_int(count, ==, -1);
+        }
+    }
+
+    /* Kill the cluster leader, so a new leader is elected and the reported
+     * number of contacted voters decreases. */
+    unsigned int leader = CLUSTER_LEADER;
+    CLUSTER_KILL(leader);
+    CLUSTER_STEP_UNTIL_HAS_LEADER(1000);
+    CLUSTER_STEP_N(1000);
+
+    for (unsigned int i = 0; i < N_SERVERS; i++) {
+        if (i == leader) {
+            continue;
+        }
+        int count = raft_voter_contacts(CLUSTER_RAFT(i));
+        if (i == CLUSTER_LEADER) {
+            munit_assert_int(count, ==, N_SERVERS - 1);
+        } else {
+            munit_assert_int(count, ==, -1);
+        }
+    }
+
+    /* Revive the old leader, so the count should go back up */
+    CLUSTER_REVIVE(leader);
+    CLUSTER_STEP_N(1000);
+    for (unsigned int i = 0; i < N_SERVERS; i++) {
+        int count = raft_voter_contacts(CLUSTER_RAFT(i));
+        if (i == CLUSTER_LEADER) {
+            munit_assert_int(count, ==, N_SERVERS);
+        } else {
+            munit_assert_int(count, ==, -1);
+        }
+    }
+
+    return MUNIT_OK;
+}
diff --git a/test/raft/lib/addrinfo.c b/test/raft/lib/addrinfo.c
new file mode 100644
index 000000000..532ddab5f
--- /dev/null
+++ b/test/raft/lib/addrinfo.c
@@ -0,0 +1,173 @@
+#include "addrinfo.h"
+
+#include <dlfcn.h>
+
+#include <netdb.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <uv.h>
+
+bool addrinfo_mock_enabled = false;
+
+enum addrinfo_mock_state { MockResultSet, MockResultReturned, SystemResult };
+
+struct addrinfo_mock_data
+{
+    enum addrinfo_mock_state state;
+    int rv;
+    struct addrinfo *result;
+    struct addrinfo_mock_data *next;
+};
+
+static struct addrinfo_mock_data *addrinfo_data;
+
+void AddrinfoInjectSetUp(MUNIT_UNUSED const MunitParameter params[])
+{
+    munit_assert_int(addrinfo_mock_enabled, ==, false);
+    munit_assert_ptr((void *)addrinfo_data, ==, NULL);
+    addrinfo_mock_enabled = true;
+}
+
+void AddrinfoInjectTearDown(void)
+{
+    munit_assert_int(addrinfo_mock_enabled, ==, true);
+    // If data is not freed, freeaddrinfo was not invoked.
+ munit_assert_ptr((void *)addrinfo_data, ==, NULL); + addrinfo_mock_enabled = false; +} + +void AddrinfoInjectSetResponse(int rv, + int num_results, + const struct AddrinfoResult *results) +{ + munit_assert_int(addrinfo_mock_enabled, ==, true); + munit_assert(!addrinfo_data || addrinfo_data->state == MockResultReturned); + munit_assert(rv || (num_results && results)); + + struct addrinfo_mock_data *response = + malloc(sizeof(struct addrinfo_mock_data)); + munit_assert_ptr((void *)response, !=, NULL); + response->state = MockResultSet; + response->rv = rv; + response->result = NULL; + for (int i = num_results - 1; i >= 0; --i) { + struct sockaddr_in *addr_in = malloc(sizeof(struct sockaddr_in)); + munit_assert_ptr((void *)addr_in, !=, NULL); + munit_assert_int(uv_ip4_addr(results[i].ip, results[i].port, addr_in), + ==, 0); + + struct addrinfo *ai = malloc(sizeof(struct addrinfo)); + munit_assert_ptr((void *)ai, !=, NULL); + ai->ai_flags = 0; + ai->ai_family = AF_INET; + ai->ai_socktype = SOCK_STREAM; + ai->ai_protocol = IPPROTO_TCP; + ai->ai_addrlen = sizeof(struct sockaddr_in); + ai->ai_addr = (struct sockaddr *)addr_in; + ai->ai_canonname = NULL; + ai->ai_next = response->result; + response->result = ai; + } + response->next = addrinfo_data; + addrinfo_data = response; +} + +static int invoke_system_getaddrinfo(const char *node, + const char *service, + const struct addrinfo *hints, + struct addrinfo **res) +{ + int (*system_getaddrinfo)(const char *node, const char *service, + const struct addrinfo *hints, + struct addrinfo **res); + *(void **)(&system_getaddrinfo) = dlsym(RTLD_NEXT, "getaddrinfo"); + munit_assert_ptr(*(void **)&system_getaddrinfo, !=, NULL); + return (*system_getaddrinfo)(node, service, hints, res); +} + +int getaddrinfo(const char *node, + const char *service, + const struct addrinfo *hints, + struct addrinfo **res) +{ + int rv; + + if (!addrinfo_mock_enabled) { + return invoke_system_getaddrinfo(node, service, hints, res); + } + if (!addrinfo_data || addrinfo_data->state == SystemResult) { + /* We have not injected response, invoke system function */ + rv = invoke_system_getaddrinfo(node, service, hints, res); + if (!rv) { + /* Store result for check on freeaddrinfo */ + struct addrinfo_mock_data *response = + malloc(sizeof(struct addrinfo_mock_data)); + munit_assert_ptr((void *)response, !=, NULL); + response->state = SystemResult; + response->rv = rv; + response->result = *res; + response->next = addrinfo_data; + addrinfo_data = response; + } + return rv; + } + if (addrinfo_data) { + munit_assert_int(addrinfo_data->state, ==, MockResultSet); + addrinfo_data->state = MockResultReturned; + rv = addrinfo_data->rv; + if (!rv) { + *res = addrinfo_data->result; + } else { + *res = NULL; + struct addrinfo_mock_data *response = addrinfo_data; + munit_assert_ptr((void *)response->result, ==, NULL); + addrinfo_data = response->next; + free(response); + } + return rv; + } + return EAI_FAIL; +} + +static void invoke_system_freeaddrinfo(struct addrinfo *res) +{ + int (*system_freeaddrinfo)(struct addrinfo * res); + *(void **)(&system_freeaddrinfo) = dlsym(RTLD_NEXT, "freeaddrinfo"); + munit_assert_ptr(*(void **)&system_freeaddrinfo, !=, NULL); + (*system_freeaddrinfo)(res); +} + +void freeaddrinfo(struct addrinfo *res) +{ + struct addrinfo_mock_data **ptr; + struct addrinfo_mock_data *response; + + // freeaddrinfo should not be invoked with a NULL pointer + munit_assert_ptr((void *)res, !=, NULL); + + if (!addrinfo_mock_enabled) { + invoke_system_freeaddrinfo(res); + return; + 
}
+    for (ptr = &addrinfo_data; *ptr; ptr = &((*ptr)->next)) {
+        if ((*ptr)->result == res) {
+            break;
+        }
+    }
+    response = *ptr;
+    munit_assert_ptr((void *)response, !=, NULL);
+    *ptr = response->next;
+    if (response->state == SystemResult) {
+        invoke_system_freeaddrinfo(response->result);
+    } else {
+        munit_assert_int(response->state, ==, MockResultReturned);
+        res = response->result;
+        while (res) {
+            struct addrinfo *next = res->ai_next;
+            free(res->ai_addr);
+            free(res);
+            res = next;
+        }
+    }
+    free(response);
+}
diff --git a/test/raft/lib/addrinfo.h b/test/raft/lib/addrinfo.h
new file mode 100644
index 000000000..cc29d5864
--- /dev/null
+++ b/test/raft/lib/addrinfo.h
@@ -0,0 +1,35 @@
+/* Support for getaddrinfo injection for test purposes.
+ *
+ * Provide a locally bound version to capture the getaddrinfo/freeaddrinfo
+ * invocations. The helper may operate in three different modes:
+ * a) Transparently forward calls to the system getaddrinfo/freeaddrinfo
+ *    functions, if SET_UP_ADDRINFO/TEAR_DOWN_ADDRINFO is not added to the
+ *    test case setup/teardown.
+ * b) Check that all results requested by getaddrinfo are freed using
+ *    freeaddrinfo. Activated by adding the SET_UP_ADDRINFO/TEAR_DOWN_ADDRINFO
+ *    macros to the test fixture.
+ * c) Additionally to b), inject artificial responses into the getaddrinfo
+ *    requests for test purposes by using AddrinfoInjectSetResponse before
+ *    triggering the getaddrinfo calls.
+ */
+
+#ifndef TEST_ADDRINFO_H
+#define TEST_ADDRINFO_H
+
+#include "munit.h"
+
+#define SET_UP_ADDRINFO AddrinfoInjectSetUp(params)
+#define TEAR_DOWN_ADDRINFO AddrinfoInjectTearDown()
+
+typedef struct AddrinfoResult
+{
+    const char *ip;
+    const int port;
+} AddrinfoResult_t;
+
+void AddrinfoInjectSetResponse(int rv,
+                               int num_results,
+                               const struct AddrinfoResult *results);
+
+void AddrinfoInjectSetUp(const MunitParameter params[]);
+void AddrinfoInjectTearDown(void);
+
+#endif // #ifndef TEST_ADDRINFO_H
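To make the three modes concrete, here is a hypothetical test exercising mode (c): it injects two fabricated records, calls the interposed getaddrinfo() directly, and frees the result so that mode (b)'s leak check in TEAR_DOWN_ADDRINFO passes. The host name is deliberately fake, since the mock never consults DNS:

#include <netdb.h>

static MunitResult testInjectedLookup(const MunitParameter params[],
                                      MUNIT_UNUSED void *data)
{
    const struct AddrinfoResult results[] = {{"127.0.0.1", 8080},
                                             {"127.0.0.2", 8080}};
    struct addrinfo hints = {0};
    struct addrinfo *res = NULL;

    SET_UP_ADDRINFO;
    AddrinfoInjectSetResponse(0, 2, results);

    /* The interposed getaddrinfo() returns the injected records. */
    munit_assert_int(getaddrinfo("fake-host.test", "8080", &hints, &res),
                     ==, 0);
    munit_assert_ptr_not_null(res);
    munit_assert_ptr_not_null(res->ai_next); /* both injected records */

    freeaddrinfo(res); /* mandatory: the teardown asserts nothing leaked */
    TEAR_DOWN_ADDRINFO;
    return MUNIT_OK;
}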
diff --git a/test/raft/lib/aio.c b/test/raft/lib/aio.c
new file mode 100644
index 000000000..c731b734b
--- /dev/null
+++ b/test/raft/lib/aio.c
@@ -0,0 +1,66 @@
+#include "aio.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "munit.h"
+
+int AioFill(aio_context_t *ctx, unsigned n)
+{
+    char buf[256];
+    int fd;
+    int rv;
+    int limit;
+    int used;
+
+    /* Figure out how many events are available. */
+    fd = open("/proc/sys/fs/aio-max-nr", O_RDONLY);
+    munit_assert_int(fd, !=, -1);
+
+    rv = read(fd, buf, sizeof buf);
+    munit_assert_int(rv, !=, -1);
+
+    close(fd);
+
+    limit = atoi(buf);
+    munit_assert_int(limit, >, 0);
+
+    /* Figure out how many events are in use. */
+    fd = open("/proc/sys/fs/aio-nr", O_RDONLY);
+    munit_assert_int(fd, !=, -1);
+
+    rv = read(fd, buf, sizeof buf);
+    munit_assert_int(rv, !=, -1);
+
+    close(fd);
+
+    used = atoi(buf);
+    munit_assert_int(used, >=, 0);
+
+    /* Best-effort check that no other process is using AIO. Our own unit test
+     * cases use up to 2 event slots at the time this function is called, so we
+     * don't consider those. */
+    if (used > 2) {
+        return -1;
+    }
+
+    rv = syscall(__NR_io_setup, limit - used - n, ctx);
+    if (rv != 0) {
+        /* The `limit - used - n` calculation is racy and io_setup can fail
+         * with EAGAIN if in the meantime another process has reserved some
+         * events. */
+        munit_assert_int(errno, ==, EAGAIN);
+        return -1;
+    }
+
+    return 0;
+}
+
+void AioDestroy(aio_context_t ctx)
+{
+    int rv;
+
+    rv = syscall(__NR_io_destroy, ctx);
+    munit_assert_int(rv, ==, 0);
+}
diff --git a/test/raft/lib/aio.h b/test/raft/lib/aio.h
new file mode 100644
index 000000000..f4540e0f6
--- /dev/null
+++ b/test/raft/lib/aio.h
@@ -0,0 +1,19 @@
+/* Utilities around the Kernel AIO sub-system. */
+#ifndef TEST_AIO_H
+#define TEST_AIO_H
+
+#include <linux/aio_abi.h>
+
+/* Fill the AIO subsystem resources by allocating a lot of events to the given
+ * context, and leaving only @n events available for subsequent calls to
+ * @io_setup.
+ *
+ * Return -1 if it looks like there is another process already using the AIO
+ * subsystem, which would most probably make the calling test flaky because
+ * there won't be exactly @n events available anymore. */
+int AioFill(aio_context_t *ctx, unsigned n);
+
+/* Destroy the given AIO context. */
+void AioDestroy(aio_context_t ctx);
+
+#endif /* TEST_AIO_H */
diff --git a/test/raft/lib/cluster.c b/test/raft/lib/cluster.c
new file mode 100644
index 000000000..68190389d
--- /dev/null
+++ b/test/raft/lib/cluster.c
@@ -0,0 +1,45 @@
+#include "cluster.h"
+
+static void randomize(struct raft_fixture *f, unsigned i, int what)
+{
+    struct raft *raft = raft_fixture_get(f, i);
+    switch (what) {
+        case RAFT_FIXTURE_TICK:
+            /* TODO: provide an API to inspect how much time has elapsed since
+             * the last election timer reset */
+            if (raft->election_timer_start == raft->io->time(raft->io)) {
+                raft_fixture_set_randomized_election_timeout(
+                    f, i,
+                    munit_rand_int_range(raft->election_timeout,
+                                         raft->election_timeout * 2));
+            }
+            break;
+        case RAFT_FIXTURE_DISK:
+            raft_fixture_set_disk_latency(f, i, munit_rand_int_range(10, 25));
+            break;
+        case RAFT_FIXTURE_NETWORK:
+            raft_fixture_set_network_latency(f, i,
+                                             munit_rand_int_range(25, 50));
+            break;
+        default:
+            munit_assert(0);
+            break;
+    }
+}
+
+void cluster_randomize_init(struct raft_fixture *f)
+{
+    unsigned i;
+    for (i = 0; i < raft_fixture_n(f); i++) {
+        randomize(f, i, RAFT_FIXTURE_TICK);
+        randomize(f, i, RAFT_FIXTURE_DISK);
+        randomize(f, i, RAFT_FIXTURE_NETWORK);
+    }
+}
+
+void cluster_randomize(struct raft_fixture *f,
+                       struct raft_fixture_event *event)
+{
+    unsigned index = raft_fixture_event_server_index(event);
+    int type = raft_fixture_event_type(event);
+    randomize(f, index, type);
+}
diff --git a/test/raft/lib/cluster.h b/test/raft/lib/cluster.h
new file mode 100644
index 000000000..602424d17
--- /dev/null
+++ b/test/raft/lib/cluster.h
@@ -0,0 +1,436 @@
+/* Setup and drive a test raft cluster. */
+
+#ifndef TEST_CLUSTER_H
+#define TEST_CLUSTER_H
+
+#include <stdlib.h>
+
+#include "../../../src/raft.h"
+#include "fsm.h"
+#include "heap.h"
+#include "munit.h"
+#include "snapshot.h"
+
+#define FIXTURE_CLUSTER                             \
+    FIXTURE_HEAP;                                   \
+    struct raft_fsm fsms[RAFT_FIXTURE_MAX_SERVERS]; \
+    struct raft_fixture cluster
+
+/* N is the default number of servers, but can be tweaked with the cluster-n
+ * parameter.
*/ +#define SETUP_CLUSTER(DEFAULT_N) \ + SET_UP_HEAP; \ + do { \ + unsigned _n = DEFAULT_N; \ + bool _pre_vote = false; \ + bool _ss_async = false; \ + int _fsm_version = 3; \ + unsigned _hb = 0; \ + unsigned _i; \ + int _rv; \ + if (munit_parameters_get(params, CLUSTER_N_PARAM) != NULL) { \ + _n = atoi(munit_parameters_get(params, CLUSTER_N_PARAM)); \ + } \ + if (munit_parameters_get(params, CLUSTER_PRE_VOTE_PARAM) != NULL) { \ + _pre_vote = \ + atoi(munit_parameters_get(params, CLUSTER_PRE_VOTE_PARAM)); \ + } \ + if (munit_parameters_get(params, CLUSTER_HEARTBEAT_PARAM) != NULL) { \ + _hb = atoi(munit_parameters_get(params, CLUSTER_HEARTBEAT_PARAM)); \ + } \ + if (munit_parameters_get(params, CLUSTER_SS_ASYNC_PARAM) != NULL) { \ + _ss_async = \ + atoi(munit_parameters_get(params, CLUSTER_SS_ASYNC_PARAM)); \ + } \ + if (munit_parameters_get(params, CLUSTER_FSM_VERSION_PARAM) != NULL) { \ + _fsm_version = \ + atoi(munit_parameters_get(params, CLUSTER_FSM_VERSION_PARAM)); \ + } \ + munit_assert_int(_n, >, 0); \ + _rv = raft_fixture_init(&f->cluster); \ + munit_assert_int(_rv, ==, 0); \ + for (_i = 0; _i < _n; _i++) { \ + if (!_ss_async || _fsm_version < 3) { \ + FsmInit(&f->fsms[_i], _fsm_version); \ + } else { \ + FsmInitAsync(&f->fsms[_i], _fsm_version); \ + } \ + _rv = raft_fixture_grow(&f->cluster, &f->fsms[_i]); \ + munit_assert_int(_rv, ==, 0); \ + } \ + for (_i = 0; _i < _n; _i++) { \ + raft_set_pre_vote(raft_fixture_get(&f->cluster, _i), _pre_vote); \ + if (_hb) { \ + raft_set_heartbeat_timeout(raft_fixture_get(&f->cluster, _i), \ + _hb); \ + } \ + } \ + } while (0) + +#define TEAR_DOWN_CLUSTER \ + do { \ + unsigned i; \ + raft_fixture_close(&f->cluster); \ + for (i = 0; i < CLUSTER_N; i++) { \ + FsmClose(&f->fsms[i]); \ + } \ + } while (0); \ + TEAR_DOWN_HEAP; + +/* Munit parameter for setting the number of servers */ +#define CLUSTER_N_PARAM "cluster-n" + +/* Munit parameter for setting the number of voting servers */ +#define CLUSTER_N_VOTING_PARAM "cluster-n-voting" + +/* Munit parameter for enabling pre-vote */ +#define CLUSTER_PRE_VOTE_PARAM "cluster-pre-vote" + +/* Munit parameter for setting HeartBeat timeout */ +#define CLUSTER_HEARTBEAT_PARAM "cluster-heartbeat" + +/* Munit parameter for setting snapshot behaviour */ +#define CLUSTER_SS_ASYNC_PARAM "cluster-snapshot-async" + +/* Munit parameter for setting fsm version */ +#define CLUSTER_FSM_VERSION_PARAM "fsm-version" + +/* Get the number of servers in the cluster. */ +#define CLUSTER_N raft_fixture_n(&f->cluster) + +/* Get the cluster time. */ +#define CLUSTER_TIME raft_fixture_time(&f->cluster) + +/* Index of the current leader, or CLUSTER_N if there's no leader. */ +#define CLUSTER_LEADER raft_fixture_leader_index(&f->cluster) + +/* True if the cluster has a leader. */ +#define CLUSTER_HAS_LEADER CLUSTER_LEADER < CLUSTER_N + +/* Get the struct raft object of the I'th server. */ +#define CLUSTER_RAFT(I) raft_fixture_get(&f->cluster, I) + +/* Get the state of the I'th server. */ +#define CLUSTER_STATE(I) raft_state(raft_fixture_get(&f->cluster, I)) + +/* Get the current term of the I'th server. */ +#define CLUSTER_TERM(I) raft_fixture_get(&f->cluster, I)->current_term + +/* Get the struct fsm object of the I'th server. */ +#define CLUSTER_FSM(I) &f->fsms[I] + +/* Return the last applied index on the I'th server. */ +#define CLUSTER_LAST_APPLIED(I) \ + raft_last_applied(raft_fixture_get(&f->cluster, I)) + +/* Return the ID of the server the I'th server has voted for. 
*/
+#define CLUSTER_VOTED_FOR(I) raft_fixture_voted_for(&f->cluster, I)
+
+/* Return a description of the last error occurred on the I'th server. */
+#define CLUSTER_ERRMSG(I) raft_errmsg(CLUSTER_RAFT(I))
+
+/* Populate the given configuration with all servers in the fixture. All
+ * servers will be voting. */
+#define CLUSTER_CONFIGURATION(CONF)                                     \
+    {                                                                   \
+        int rv_;                                                        \
+        rv_ = raft_fixture_configuration(&f->cluster, CLUSTER_N, CONF); \
+        munit_assert_int(rv_, ==, 0);                                   \
+    }
+
+/* Bootstrap all servers in the cluster. All servers will be voting, unless the
+ * cluster-n-voting parameter is used. */
+#define CLUSTER_BOOTSTRAP                                                    \
+    {                                                                        \
+        unsigned n_ = CLUSTER_N;                                             \
+        int rv_;                                                             \
+        struct raft_configuration configuration;                             \
+        if (munit_parameters_get(params, CLUSTER_N_VOTING_PARAM) != NULL) {  \
+            n_ = atoi(munit_parameters_get(params, CLUSTER_N_VOTING_PARAM)); \
+        }                                                                    \
+        rv_ = raft_fixture_configuration(&f->cluster, n_, &configuration);   \
+        munit_assert_int(rv_, ==, 0);                                        \
+        rv_ = raft_fixture_bootstrap(&f->cluster, &configuration);           \
+        munit_assert_int(rv_, ==, 0);                                        \
+        raft_configuration_close(&configuration);                            \
+    }
+
+/* Bootstrap all servers in the cluster. Only the first N servers will be
+ * voting. */
+#define CLUSTER_BOOTSTRAP_N_VOTING(N)                                      \
+    {                                                                      \
+        int rv_;                                                           \
+        struct raft_configuration configuration_;                          \
+        rv_ = raft_fixture_configuration(&f->cluster, N, &configuration_); \
+        munit_assert_int(rv_, ==, 0);                                      \
+        rv_ = raft_fixture_bootstrap(&f->cluster, &configuration_);        \
+        munit_assert_int(rv_, ==, 0);                                      \
+        raft_configuration_close(&configuration_);                         \
+    }
+
+/* Start all servers in the test cluster. */
+#define CLUSTER_START                         \
+    {                                         \
+        int rc;                               \
+        rc = raft_fixture_start(&f->cluster); \
+        munit_assert_int(rc, ==, 0);          \
+    }
+
+/* Step the cluster. */
+#define CLUSTER_STEP raft_fixture_step(&f->cluster);
+
+/* Step the cluster N times. */
+#define CLUSTER_STEP_N(N)                   \
+    {                                       \
+        unsigned i_;                        \
+        for (i_ = 0; i_ < N; i_++) {        \
+            raft_fixture_step(&f->cluster); \
+        }                                   \
+    }
+
+/* Step until the given function becomes true. */
+#define CLUSTER_STEP_UNTIL(FUNC, ARG, MSECS)                            \
+    {                                                                   \
+        bool done_;                                                     \
+        done_ = raft_fixture_step_until(&f->cluster, FUNC, ARG, MSECS); \
+        munit_assert_true(done_);                                       \
+    }
+
+/* Step the cluster until the given number of milliseconds have elapsed. */
+#define CLUSTER_STEP_UNTIL_ELAPSED(MSECS) \
+    raft_fixture_step_until_elapsed(&f->cluster, MSECS)
+
+/* Step the cluster until a leader is elected or #MAX_MSECS have elapsed. */
+#define CLUSTER_STEP_UNTIL_HAS_LEADER(MAX_MSECS)                           \
+    {                                                                      \
+        bool done;                                                         \
+        done = raft_fixture_step_until_has_leader(&f->cluster, MAX_MSECS); \
+        munit_assert_true(done);                                           \
+        munit_assert_true(CLUSTER_HAS_LEADER);                             \
+    }
+
+/* Step the cluster until there's no leader or #MAX_MSECS have elapsed. */
+#define CLUSTER_STEP_UNTIL_HAS_NO_LEADER(MAX_MSECS)                           \
+    {                                                                         \
+        bool done;                                                            \
+        done = raft_fixture_step_until_has_no_leader(&f->cluster, MAX_MSECS); \
+        munit_assert_true(done);                                              \
+        munit_assert_false(CLUSTER_HAS_LEADER);                               \
+    }
+
+/* Step the cluster until the given index has been applied by the given server
+ * (or all, if N) or #MAX_MSECS have elapsed. */
+#define CLUSTER_STEP_UNTIL_APPLIED(I, INDEX, MAX_MSECS)                        \
+    {                                                                          \
+        bool done;                                                             \
+        done =                                                                 \
+            raft_fixture_step_until_applied(&f->cluster, I, INDEX, MAX_MSECS); \
+        munit_assert_true(done);                                               \
+    }
+
+/* Step the cluster until the state of the server with the given index matches
+ * the given value, or #MAX_MSECS have elapsed.
*/ +#define CLUSTER_STEP_UNTIL_STATE_IS(I, STATE, MAX_MSECS) \ + { \ + bool done; \ + done = raft_fixture_step_until_state_is(&f->cluster, I, STATE, \ + MAX_MSECS); \ + munit_assert_true(done); \ + } + +/* Step the cluster until the term of the server with the given index matches + * the given value, or #MAX_MSECS have elapsed. */ +#define CLUSTER_STEP_UNTIL_TERM_IS(I, TERM, MAX_MSECS) \ + { \ + bool done; \ + done = \ + raft_fixture_step_until_term_is(&f->cluster, I, TERM, MAX_MSECS); \ + munit_assert_true(done); \ + } + +/* Step the cluster until server I has voted for server J, or #MAX_MSECS have + * elapsed. */ +#define CLUSTER_STEP_UNTIL_VOTED_FOR(I, J, MAX_MSECS) \ + { \ + bool done; \ + done = \ + raft_fixture_step_until_voted_for(&f->cluster, I, J, MAX_MSECS); \ + munit_assert_true(done); \ + } + +/* Step the cluster until all messages from server I to server J have been + * delivered, or #MAX_MSECS elapse. */ +#define CLUSTER_STEP_UNTIL_DELIVERED(I, J, MAX_MSECS) \ + { \ + bool done; \ + done = \ + raft_fixture_step_until_delivered(&f->cluster, I, J, MAX_MSECS); \ + munit_assert_true(done); \ + } + +/* Request to apply an FSM command to add the given value to x. */ +#define CLUSTER_APPLY_ADD_X(I, REQ, VALUE, CB) \ + { \ + struct raft_buffer buf_; \ + struct raft *raft_; \ + int rv_; \ + FsmEncodeAddX(VALUE, &buf_); \ + raft_ = raft_fixture_get(&f->cluster, I); \ + rv_ = raft_apply(raft_, REQ, &buf_, 1, CB); \ + munit_assert_int(rv_, ==, 0); \ + } + +/* Kill the I'th server. */ +#define CLUSTER_KILL(I) raft_fixture_kill(&f->cluster, I); + +/* Revive the I'th server */ +#define CLUSTER_REVIVE(I) raft_fixture_revive(&f->cluster, I); + +/* Kill the leader. */ +#define CLUSTER_KILL_LEADER CLUSTER_KILL(CLUSTER_LEADER) + +/* Kill a majority of servers, except the leader (if there is one). */ +#define CLUSTER_KILL_MAJORITY \ + { \ + size_t i2; \ + size_t n; \ + for (i2 = 0, n = 0; n < (CLUSTER_N / 2) + 1; i2++) { \ + if (i2 == CLUSTER_LEADER) { \ + continue; \ + } \ + CLUSTER_KILL(i2) \ + n++; \ + } \ + } + +/* Grow the cluster adding one server. */ +#define CLUSTER_GROW \ + { \ + int rv_; \ + FsmInit(&f->fsms[CLUSTER_N], 2); \ + rv_ = raft_fixture_grow(&f->cluster, &f->fsms[CLUSTER_N]); \ + munit_assert_int(rv_, ==, 0); \ + } + +/* Add a new pristine server to the cluster, connected to all others. Then + * submit a request to add it to the configuration as an idle server. */ +#define CLUSTER_ADD(REQ) \ + { \ + int rc; \ + struct raft *new_raft; \ + CLUSTER_GROW; \ + rc = raft_start(CLUSTER_RAFT(CLUSTER_N - 1)); \ + munit_assert_int(rc, ==, 0); \ + new_raft = CLUSTER_RAFT(CLUSTER_N - 1); \ + rc = raft_add(CLUSTER_RAFT(CLUSTER_LEADER), REQ, new_raft->id, \ + new_raft->address, NULL); \ + munit_assert_int(rc, ==, 0); \ + } + +/* Assign the given role to the server that was added last. */ +#define CLUSTER_ASSIGN(REQ, ROLE) \ + do { \ + unsigned _id; \ + int _rv; \ + _id = CLUSTER_N; /* Last server that was added. */ \ + _rv = raft_assign(CLUSTER_RAFT(CLUSTER_LEADER), REQ, _id, ROLE, NULL); \ + munit_assert_int(_rv, ==, 0); \ + } while (0) + +/* Ensure that the cluster can make progress from the current state. + * + * - If no leader is present, wait for one to be elected. + * - Submit a request to apply a new FSM command and wait for it to complete. 
*/
+#define CLUSTER_MAKE_PROGRESS                                          \
+    {                                                                  \
+        struct raft_apply *req_ = munit_malloc(sizeof *req_);          \
+        if (!(CLUSTER_HAS_LEADER)) {                                   \
+            CLUSTER_STEP_UNTIL_HAS_LEADER(10000);                      \
+        }                                                              \
+        CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req_, 1, NULL);            \
+        CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, req_->index, 3000); \
+        free(req_);                                                    \
+    }
+
+/* Elect the I'th server. */
+#define CLUSTER_ELECT(I) raft_fixture_elect(&f->cluster, I)
+
+/* Start to elect the I'th server. */
+#define CLUSTER_START_ELECT(I) raft_fixture_start_elect(&f->cluster, I)
+
+/* Depose the current leader. */
+#define CLUSTER_DEPOSE raft_fixture_depose(&f->cluster)
+
+/* Disconnect I from J. */
+#define CLUSTER_DISCONNECT(I, J) raft_fixture_disconnect(&f->cluster, I, J)
+
+/* Reconnect I to J. */
+#define CLUSTER_RECONNECT(I, J) raft_fixture_reconnect(&f->cluster, I, J)
+
+/* Saturate the connection from I to J. */
+#define CLUSTER_SATURATE(I, J) raft_fixture_saturate(&f->cluster, I, J)
+
+/* Saturate the connection between I and J, in both directions. */
+#define CLUSTER_SATURATE_BOTHWAYS(I, J) \
+    CLUSTER_SATURATE(I, J);             \
+    CLUSTER_SATURATE(J, I)
+
+/* Desaturate the connection from I to J, making messages flow again. */
+#define CLUSTER_DESATURATE(I, J) raft_fixture_desaturate(&f->cluster, I, J)
+
+/* Desaturate the connection between I and J, in both directions. */
+#define CLUSTER_DESATURATE_BOTHWAYS(I, J) \
+    CLUSTER_DESATURATE(I, J);             \
+    CLUSTER_DESATURATE(J, I)
+
+/* Set the network latency of outgoing messages of server I. */
+#define CLUSTER_SET_NETWORK_LATENCY(I, MSECS) \
+    raft_fixture_set_network_latency(&f->cluster, I, MSECS)
+
+/* Set the disk I/O latency of server I. */
+#define CLUSTER_SET_DISK_LATENCY(I, MSECS) \
+    raft_fixture_set_disk_latency(&f->cluster, I, MSECS)
+
+/* Set the term persisted on the I'th server. This must be called before
+ * starting the cluster. */
+#define CLUSTER_SET_TERM(I, TERM) raft_fixture_set_term(&f->cluster, I, TERM)
+
+/* Set the snapshot persisted on the I'th server. This must be called before
+ * starting the cluster. */
+#define CLUSTER_SET_SNAPSHOT(I, LAST_INDEX, LAST_TERM, CONF_INDEX, X, Y)  \
+    {                                                                     \
+        struct raft_configuration configuration_;                         \
+        struct raft_snapshot *snapshot_;                                  \
+        CLUSTER_CONFIGURATION(&configuration_);                           \
+        CREATE_SNAPSHOT(snapshot_, LAST_INDEX, LAST_TERM, configuration_, \
+                        CONF_INDEX, X, Y);                                \
+        raft_fixture_set_snapshot(&f->cluster, I, snapshot_);             \
+    }
+
+/* Add an entry to the ones persisted on the I'th server. This must be called
+ * before starting the cluster. */
+#define CLUSTER_ADD_ENTRY(I, ENTRY) \
+    raft_fixture_add_entry(&f->cluster, I, ENTRY)
+
+/* Return the number of messages sent by the given server. */
+#define CLUSTER_N_SEND(I, TYPE) raft_fixture_n_send(&f->cluster, I, TYPE)
+
+/* Return the number of messages received by the given server. */
+#define CLUSTER_N_RECV(I, TYPE) raft_fixture_n_recv(&f->cluster, I, TYPE)
+
+/* Set a fixture hook that randomizes election timeouts, disk latency and
+ * network latency. */
+#define CLUSTER_RANDOMIZE                \
+    cluster_randomize_init(&f->cluster); \
+    raft_fixture_hook(&f->cluster, cluster_randomize)
+
+void cluster_randomize_init(struct raft_fixture *f);
+void cluster_randomize(struct raft_fixture *f,
+                       struct raft_fixture_event *event);
+
+#endif /* TEST_CLUSTER_H */
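For reviewers, here is a minimal sketch (not part of this patch) of how the macros above compose in a munit test. It assumes the FIXTURE_CLUSTER, SETUP_CLUSTER and TEAR_DOWN_CLUSTER companion helpers defined earlier in cluster.h; the fixture and test names are made up for illustration:

    /* Hypothetical test built on the cluster harness (sketch only). */
    struct fixture
    {
        FIXTURE_CLUSTER; /* Assumed to provide f->cluster and f->fsms. */
    };

    static void *setUp(const MunitParameter params[], void *user_data)
    {
        struct fixture *f = munit_malloc(sizeof *f);
        (void)user_data;
        SETUP_CLUSTER(3);  /* Assumed helper: create a 3-server cluster. */
        CLUSTER_BOOTSTRAP; /* All voting, unless cluster-n-voting is set. */
        CLUSTER_START;
        return f;
    }

    static void tearDown(void *data)
    {
        struct fixture *f = data;
        TEAR_DOWN_CLUSTER; /* Assumed helper. */
        free(f);
    }

    static MunitResult testMakeProgress(const MunitParameter params[],
                                        void *data)
    {
        struct fixture *f = data;
        (void)params;
        CLUSTER_STEP_UNTIL_HAS_LEADER(10000); /* Wait for an election. */
        CLUSTER_MAKE_PROGRESS;                /* Commit one FSM command. */
        return MUNIT_OK;
    }

Note that the macros deliberately reference a variable named f (and, for CLUSTER_BOOTSTRAP, the params argument), so they only expand correctly inside functions with those names in scope, as in the sketch.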
diff --git a/test/raft/lib/dir.c b/test/raft/lib/dir.c
new file mode 100644
index 000000000..a2c8f1d36
--- /dev/null
+++ b/test/raft/lib/dir.c
@@ -0,0 +1,423 @@
+#include "dir.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <ftw.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#define SEP "/"
+#define TEMPLATE "raft-test-XXXXXX"
+
+#define TEST_DIR_TEMPLATE "./tmp/%s/raft-test-XXXXXX"
+
+static char *dirAll[] = {"tmpfs", "ext4", "btrfs", "xfs", "zfs", NULL};
+
+static char *dirTmpfs[] = {"tmpfs", NULL};
+
+static char *dirAio[] = {"btrfs", "ext4", "xfs", NULL};
+
+static char *dirNoAio[] = {"tmpfs", "zfs", NULL};
+
+MunitParameterEnum DirTmpfsParams[] = {
+    {DIR_FS_PARAM, dirTmpfs},
+    {NULL, NULL},
+};
+
+MunitParameterEnum DirAllParams[] = {
+    {DIR_FS_PARAM, dirAll},
+    {NULL, NULL},
+};
+
+MunitParameterEnum DirAioParams[] = {
+    {DIR_FS_PARAM, dirAio},
+    {NULL, NULL},
+};
+
+MunitParameterEnum DirNoAioParams[] = {
+    {DIR_FS_PARAM, dirNoAio},
+    {NULL, NULL},
+};
+
+/* Create a temporary directory in the given parent directory. */
+static char *dirMakeTemp(const char *parent)
+{
+    char *dir;
+    if (parent == NULL) {
+        return NULL;
+    }
+    dir = munit_malloc(strlen(parent) + strlen(SEP) + strlen(TEMPLATE) + 1);
+    sprintf(dir, "%s%s%s", parent, SEP, TEMPLATE);
+    if (mkdtemp(dir) == NULL) {
+        munit_error(strerror(errno));
+    }
+    return dir;
+}
+
+void *DirSetUp(MUNIT_UNUSED const MunitParameter params[],
+               MUNIT_UNUSED void *user_data)
+{
+    const char *fs = munit_parameters_get(params, DIR_FS_PARAM);
+    if (fs == NULL) {
+        return dirMakeTemp("/tmp");
+    } else if (strcmp(fs, "tmpfs") == 0) {
+        return DirTmpfsSetUp(params, user_data);
+    } else if (strcmp(fs, "ext4") == 0) {
+        return DirExt4SetUp(params, user_data);
+    } else if (strcmp(fs, "btrfs") == 0) {
+        return DirBtrfsSetUp(params, user_data);
+    } else if (strcmp(fs, "zfs") == 0) {
+        return DirZfsSetUp(params, user_data);
+    } else if (strcmp(fs, "xfs") == 0) {
+        return DirXfsSetUp(params, user_data);
+    }
+    munit_errorf("Unsupported file system %s", fs);
+    return NULL;
+}
+
+void *DirTmpfsSetUp(MUNIT_UNUSED const MunitParameter params[],
+                    MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_TMPFS"));
+}
+
+void *DirExt4SetUp(MUNIT_UNUSED const MunitParameter params[],
+                   MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_EXT4"));
+}
+
+void *DirBtrfsSetUp(MUNIT_UNUSED const MunitParameter params[],
+                    MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_BTRFS"));
+}
+
+void *DirZfsSetUp(MUNIT_UNUSED const MunitParameter params[],
+                  MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_ZFS"));
+}
+
+void *DirXfsSetUp(MUNIT_UNUSED const MunitParameter params[],
+                  MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_XFS"));
+}
+
+/* Wrapper around remove(), compatible with nftw.
*/ +static int dirRemoveFn(const char *path, + MUNIT_UNUSED const struct stat *sbuf, + MUNIT_UNUSED int type, + MUNIT_UNUSED struct FTW *ftwb) +{ + return remove(path); +} + +static void dirRemove(char *dir) +{ + int rv; + rv = chmod(dir, 0755); + munit_assert_int(rv, ==, 0); + + rv = nftw(dir, dirRemoveFn, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS); + munit_assert_int(rv, ==, 0); +} + +static bool dirExists(const char *dir) +{ + struct stat sb; + int rv; + + rv = stat(dir, &sb); + if (rv == -1) { + munit_assert_int(errno, ==, ENOENT); + return false; + } + + return true; +} + +void DirTearDown(void *data) +{ + char *dir = data; + if (dir == NULL) { + return; + } + if (dirExists(dir)) { + dirRemove(dir); + } + free(dir); +} + +/* Join the given @dir and @filename into @path. */ +static void joinPath(const char *dir, const char *filename, char *path) +{ + strcpy(path, dir); + strcat(path, "/"); + strcat(path, filename); +} + +void DirWriteFile(const char *dir, + const char *filename, + const void *buf, + const size_t n) +{ + char path[256]; + int fd; + int rv; + + joinPath(dir, filename, path); + + fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + munit_assert_int(fd, !=, -1); + + rv = write(fd, buf, n); + munit_assert_int(rv, ==, n); + + close(fd); +} + +void DirWriteFileWithZeros(const char *dir, + const char *filename, + const size_t n) +{ + void *buf = munit_malloc(n); + + DirWriteFile(dir, filename, buf, n); + + free(buf); +} + +void DirOverwriteFile(const char *dir, + const char *filename, + const void *buf, + const size_t n, + const off_t whence) +{ + char path[256]; + int fd; + int rv; + off_t size; + + joinPath(dir, filename, path); + + fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); + + munit_assert_int(fd, !=, -1); + + /* Get the size of the file */ + size = lseek(fd, 0, SEEK_END); + + if (whence == 0) { + munit_assert_int(size, >=, n); + lseek(fd, 0, SEEK_SET); + } else if (whence > 0) { + munit_assert_int(whence, <=, size); + munit_assert_int(size - whence, >=, n); + lseek(fd, whence, SEEK_SET); + } else { + munit_assert_int(-whence, <=, size); + munit_assert_int(-whence, >=, n); + lseek(fd, whence, SEEK_END); + } + + rv = write(fd, buf, n); + munit_assert_int(rv, ==, n); + + close(fd); +} + +void DirTruncateFile(const char *dir, const char *filename, const size_t n) +{ + char path[256]; + int fd; + int rv; + + joinPath(dir, filename, path); + + fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); + munit_assert_int(fd, !=, -1); + + rv = ftruncate(fd, n); + munit_assert_int(rv, ==, 0); + + rv = close(fd); + munit_assert_int(rv, ==, 0); +} + +void DirGrowFile(const char *dir, const char *filename, const size_t n) +{ + char path[256]; + int fd; + struct stat sb; + void *buf; + size_t size; + int rv; + + joinPath(dir, filename, path); + + fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); + munit_assert_int(fd, !=, -1); + + rv = fstat(fd, &sb); + munit_assert_int(rv, ==, 0); + munit_assert_int(sb.st_size, <=, n); + + /* Fill with zeros. 
*/ + lseek(fd, sb.st_size, SEEK_SET); + size = n - sb.st_size; + buf = munit_malloc(size); + rv = write(fd, buf, size); + munit_assert_int(rv, ==, size); + free(buf); + + rv = close(fd); + munit_assert_int(rv, ==, 0); +} + +void DirRenameFile(const char *dir, + const char *filename1, + const char *filename2) +{ + char path1[256]; + char path2[256]; + int rv; + + joinPath(dir, filename1, path1); + joinPath(dir, filename2, path2); + + rv = rename(path1, path2); + munit_assert_int(rv, ==, 0); +} + +void DirRemoveFile(const char *dir, const char *filename) +{ + char path[256]; + int rv; + + joinPath(dir, filename, path); + rv = unlink(path); + munit_assert_int(rv, ==, 0); +} + +void DirReadFile(const char *dir, + const char *filename, + void *buf, + const size_t n) +{ + char path[256]; + int fd; + int rv; + + joinPath(dir, filename, path); + + fd = open(path, O_RDONLY); + if (fd == -1) { + munit_logf(MUNIT_LOG_ERROR, "read file '%s': %s", path, + strerror(errno)); + } + + rv = read(fd, buf, n); + munit_assert_int(rv, ==, n); + + close(fd); +} + +void DirMakeUnexecutable(const char *dir) +{ + int rv; + + rv = chmod(dir, 0); + munit_assert_int(rv, ==, 0); +} + +void DirMakeUnwritable(const char *dir) +{ + int rv; + + rv = chmod(dir, 0500); + munit_assert_int(rv, ==, 0); +} + +void DirMakeFileUnreadable(const char *dir, const char *filename) +{ + char path[256]; + int rv; + + joinPath(dir, filename, path); + + rv = chmod(path, 0); + munit_assert_int(rv, ==, 0); +} + +bool DirHasFile(const char *dir, const char *filename) +{ + char path[256]; + int fd; + + joinPath(dir, filename, path); + + fd = open(path, O_RDONLY); + if (fd == -1) { + munit_assert_true(errno == ENOENT || errno == EACCES); + return false; + } + + close(fd); + + return true; +} + +void DirFill(const char *dir, const size_t n) +{ + char path[256]; + const char *filename = ".fill"; + struct statvfs fs; + size_t size; + int fd; + int rv; + + rv = statvfs(dir, &fs); + munit_assert_int(rv, ==, 0); + + size = fs.f_bsize * fs.f_bavail; + + if (n > 0) { + munit_assert_int(size, >=, n); + } + + joinPath(dir, filename, path); + + fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + munit_assert_int(fd, !=, -1); + + rv = posix_fallocate(fd, 0, size - n); + munit_assert_int(rv, ==, 0); + + /* If n is zero, make sure any further write fails with ENOSPC */ + if (n == 0) { + char buf[4096]; + int i; + + rv = lseek(fd, 0, SEEK_END); + munit_assert_int(rv, !=, -1); + + for (i = 0; i < 40; i++) { + rv = write(fd, buf, sizeof buf); + if (rv < 0) { + break; + } + } + + munit_assert_int(rv, ==, -1); + munit_assert_int(errno, ==, ENOSPC); + } + + close(fd); +} diff --git a/test/raft/lib/dir.h b/test/raft/lib/dir.h new file mode 100644 index 000000000..7980e6c1b --- /dev/null +++ b/test/raft/lib/dir.h @@ -0,0 +1,142 @@ +/* Test directory utilities. + * + * This module sports helpers to create temporary directories backed by various + * file systems, read/write files in them, check for the presence of files + * etc. */ + +#ifndef TEST_DIR_H +#define TEST_DIR_H + +#include + +#include "munit.h" + +/* Munit parameter defining the file system type backing the temporary directory + * created by test_dir_setup(). + * + * The various file systems must have been previously setup with the fs.sh + * script. */ +#define DIR_FS_PARAM "dir-fs" + +#define FIXTURE_DIR char *dir +#define SET_UP_DIR \ + f->dir = DirSetUp(params, user_data); \ + if (f->dir == NULL) { /* Fs not available, test must be skipped. 
*/ \
+        free(f);                                 \
+        return NULL;                             \
+    }
+#define TEAR_DOWN_DIR DirTearDown(f->dir)
+
+/* Contains a single DIR_FS_PARAM parameter set to all supported file system
+ * types. */
+extern MunitParameterEnum DirAllParams[];
+
+/* Contains a single DIR_FS_PARAM parameter set to tmpfs. */
+extern MunitParameterEnum DirTmpfsParams[];
+
+/* Contains a single DIR_FS_PARAM parameter set to all file systems with
+ * proper AIO support (i.e. NOWAIT works). */
+extern MunitParameterEnum DirAioParams[];
+
+/* Contains a single DIR_FS_PARAM parameter set to all file systems without
+ * proper AIO support (i.e. NOWAIT does not work). */
+extern MunitParameterEnum DirNoAioParams[];
+
+/* Create a temporary test directory.
+ *
+ * Return a pointer to the path of the created directory. */
+void *DirSetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by tmpfs.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no tmpfs
+ * file system is available. */
+void *DirTmpfsSetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by ext4.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no ext4
+ * file system is available. */
+void *DirExt4SetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by btrfs.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no btrfs
+ * file system is available. */
+void *DirBtrfsSetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by zfs.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no zfs
+ * file system is available. */
+void *DirZfsSetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by xfs.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no xfs
+ * file system is available. */
+void *DirXfsSetUp(const MunitParameter params[], void *user_data);
+
+/* Recursively remove a temporary directory. */
+void DirTearDown(void *data);
+
+/* Write the given @buf to the given @filename in the given @dir. */
+void DirWriteFile(const char *dir,
+                  const char *filename,
+                  const void *buf,
+                  const size_t n);
+
+/* Write the given @filename and fill it with @n bytes of zeros. */
+void DirWriteFileWithZeros(const char *dir,
+                           const char *filename,
+                           const size_t n);
+
+/* Overwrite @n bytes of the given file with the given @buf data.
+ *
+ * If @whence is zero, overwrite the first @n bytes of the file. If @whence is
+ * positive overwrite the @n bytes starting at offset @whence. If @whence is
+ * negative overwrite @n bytes starting at @whence bytes from the end of the
+ * file. */
+void DirOverwriteFile(const char *dir,
+                      const char *filename,
+                      const void *buf,
+                      const size_t n,
+                      const off_t whence);
+
+/* Truncate the given file, leaving only the first @n bytes. */
+void DirTruncateFile(const char *dir, const char *filename, const size_t n);
+
+/* Grow the given file to the given size, filling the new bytes with zeros. */
+void DirGrowFile(const char *dir, const char *filename, const size_t n);
+
+/* Rename a file in the given directory from filename1 to filename2. */
+void DirRenameFile(const char *dir,
+                   const char *filename1,
+                   const char *filename2);
+
+/* Remove a file. */
+void DirRemoveFile(const char *dir, const char *filename);
+
+/* Read into @buf the content of the given @filename in the given @dir. */
+void DirReadFile(const char *dir,
+                 const char *filename,
+                 void *buf,
+                 const size_t n);
+
+/* Make the given directory not executable, so that files inside it can't be
+ * opened. */
+void DirMakeUnexecutable(const char *dir);
+
+/* Make the given directory not writable. */
+void DirMakeUnwritable(const char *dir);
+
+/* Make the given file not readable. */
+void DirMakeFileUnreadable(const char *dir, const char *filename);
+
+/* Check if the given directory has the given file. */
+bool DirHasFile(const char *dir, const char *filename);
+
+/* Fill the underlying file system of the given dir, leaving only n bytes
+ * free. */
+void DirFill(const char *dir, const size_t n);
+
+#endif /* TEST_DIR_H */
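To make the intended use of these helpers concrete, here is a hedged sketch (not part of this patch) of a test that provisions a file, corrupts it, and checks the outcome. The fixture type and the file name are made up for illustration; only the Dir* calls come from the header above:

    /* Hypothetical test exercising the directory helpers (sketch only).
     * Assumes a fixture declared with FIXTURE_DIR and set up via SET_UP_DIR. */
    static MunitResult testTruncatedFile(const MunitParameter params[],
                                         void *data)
    {
        struct fixture *f = data;
        char buf[64] = {0};
        (void)params;

        /* Create a 64-byte file, then cut it in half. */
        DirWriteFile(f->dir, "open-segment", buf, sizeof buf);
        DirTruncateFile(f->dir, "open-segment", sizeof buf / 2);
        munit_assert_true(DirHasFile(f->dir, "open-segment"));

        /* Remove it and verify it is gone. */
        DirRemoveFile(f->dir, "open-segment");
        munit_assert_false(DirHasFile(f->dir, "open-segment"));

        return MUNIT_OK;
    }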
diff --git a/test/raft/lib/fault.c b/test/raft/lib/fault.c
new file mode 100644
index 000000000..197c3adc1
--- /dev/null
+++ b/test/raft/lib/fault.c
@@ -0,0 +1,69 @@
+#include "fault.h"
+
+#include "munit.h"
+
+void FaultInit(struct Fault *f)
+{
+    f->countdown = -1;
+    f->n = -1;
+    f->paused = false;
+}
+
+bool FaultTick(struct Fault *f)
+{
+    if (MUNIT_UNLIKELY(f->paused)) {
+        return false;
+    }
+
+    /* If the initial delay parameter was set to -1, then never fail. This is
+     * the most common case. */
+    if (MUNIT_LIKELY(f->countdown < 0)) {
+        return false;
+    }
+
+    /* If we did not yet reach 'delay' ticks, then just decrease the countdown.
+     */
+    if (f->countdown > 0) {
+        f->countdown--;
+        return false;
+    }
+
+    munit_assert_int(f->countdown, ==, 0);
+
+    /* We reached 'delay' ticks, let's see how many times we have to trigger
+     * the fault, if any. */
+
+    if (f->n < 0) {
+        /* Trigger the fault forever. */
+        return true;
+    }
+
+    if (f->n > 0) {
+        /* Trigger the fault at least this time. */
+        f->n--;
+        return true;
+    }
+
+    munit_assert_int(f->n, ==, 0);
+
+    /* We reached 'repeat' ticks, let's stop triggering the fault. */
+    f->countdown--;
+
+    return false;
+}
+
+void FaultConfig(struct Fault *f, int delay, int repeat)
+{
+    f->countdown = delay;
+    f->n = repeat;
+}
+
+void FaultPause(struct Fault *f)
+{
+    f->paused = true;
+}
+
+void FaultResume(struct Fault *f)
+{
+    f->paused = false;
+}
diff --git a/test/raft/lib/fault.h b/test/raft/lib/fault.h
new file mode 100644
index 000000000..056469391
--- /dev/null
+++ b/test/raft/lib/fault.h
@@ -0,0 +1,32 @@
+/* Helper for test components supporting fault injection. */
+
+#ifndef TEST_FAULT_H
+#define TEST_FAULT_H
+
+#include <stdbool.h>
+
+/* Information about a fault that should occur in a component. */
+struct Fault
+{
+    int countdown; /* Trigger the fault when this counter gets to zero. */
+    int n;         /* Repeat the fault this many times. Default is -1. */
+    bool paused;   /* Pause fault triggering. */
+};
+
+/* Initialize a fault. */
+void FaultInit(struct Fault *f);
+
+/* Advance the counters of the fault. Return true if the fault should be
+ * triggered, false otherwise. */
+bool FaultTick(struct Fault *f);
+
+/* Configure the fault with the given values. */
+void FaultConfig(struct Fault *f, int delay, int repeat);
+
+/* Pause triggering configured faults. */
+void FaultPause(struct Fault *f);
+
+/* Resume triggering configured faults. */
+void FaultResume(struct Fault *f);
+
+#endif /* TEST_FAULT_H */
diff --git a/test/raft/lib/fs.sh b/test/raft/lib/fs.sh
new file mode 100755
index 000000000..638eeda91
--- /dev/null
+++ b/test/raft/lib/fs.sh
@@ -0,0 +1,118 @@
+#!/bin/sh -e

+# Set up loopback disk devices to test the raft I/O implementation against
+# various file systems.
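+#
+# Example usage (illustrative only; the RAFT_TMP_* variables printed by the
+# "detect" command are the ones consumed by test/raft/lib/dir.c):
+#
+#   ./test/raft/lib/fs.sh setup            # create and mount the file systems
+#   export $(./test/raft/lib/fs.sh detect) # point RAFT_TMP_* at the mounts
+#   make check                             # run the test suite against them
+#   ./test/raft/lib/fs.sh teardown         # unmount and remove everything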
+ +usage() { + echo "usage: $0 setup|teardown [types]" +} + +if [ "${#}" -lt 1 ]; then + usage + exit 1 +fi + +cmd="${1}" +shift + +types="tmpfs" + +# Check if loop devices are available, we might be running inside an +# unprivileged container +if sudo losetup -f > /dev/null 2>&1; then + types="$types ext4" + + if [ "$(which mkfs.btrfs)" != "" ]; then + types="$types btrfs" + fi + + if [ "$(which mkfs.xfs)" != "" ]; then + types="$types xfs" + fi + + if [ "$(which zfs)" != "" ]; then + types="$types zfs" + fi + + if [ "${#}" -gt 0 ]; then + types="${@}" + fi + +fi + +if [ "${cmd}" = "detect" ]; then + vars="" + for type in $types; do + vars="${vars}RAFT_TMP_$(echo ${type} | tr [a-z] [A-Z])=./tmp/${type} " + done + echo $vars + exit 0 +fi + +if [ "${cmd}" = "setup" ]; then + mkdir ./tmp + + for type in $types; do + echo -n "Creating $type loop device mount..." + + # Create the fs mount point + mkdir "./tmp/${type}" + + if [ "$type" = "tmpfs" ]; then + # For tmpfs we don't need a loopback disk device. + sudo mount -t tmpfs -o size=32m tmpfs ./tmp/tmpfs + else + # Create a loopback disk device + dd if=/dev/zero of="./tmp/.${type}" bs=4096 count=28672 > /dev/null 2>&1 + loop=$(sudo losetup -f) + sudo losetup "${loop}" "./tmp/.${type}" + + # Initialize the file system + if [ "$type" = "zfs" ]; then + sudo zpool create raft "${loop}" + sudo zfs create -o mountpoint=$(pwd)/tmp/zfs raft/zfs + else + sudo mkfs.${type} "${loop}" > /dev/null 2>&1 + sudo mount "${loop}" "./tmp/${type}" + fi + fi + + sudo chown $USER "./tmp/${type}" + + echo " done" + done + + exit 0 +fi + +if [ "${cmd}" = "teardown" ]; then + + for type in $types; do + echo -n "Deleting $type loop device mount..." + + sudo umount "./tmp/${type}" + rm -rf "./tmp/${type}" + + if [ "$type" != "tmpfs" ]; then + # For zfs we need to destroy the pool + if [ "$type" = "zfs" ]; then + sudo zpool destroy raft + fi + + # For regular file systems, remove the loopback disk device. + loop=$(sudo losetup -a | grep ".${type}" | cut -f 1 -d :) + sudo losetup -d "${loop}" + rm "./tmp/.${type}" + fi + + echo " done" + done + + rmdir ./tmp + + exit 0 +fi + +usage + +exit 1 diff --git a/test/raft/lib/fsm.c b/test/raft/lib/fsm.c new file mode 100644 index 000000000..78b6ff90e --- /dev/null +++ b/test/raft/lib/fsm.c @@ -0,0 +1,293 @@ +#include "fsm.h" + +#include "../../../src/raft/byte.h" +#include "munit.h" + +/* In-memory implementation of the raft_fsm interface. 
*/ +struct fsm +{ + int x; + int y; + int lock; + void *data; +}; + +/* Command codes */ +enum { SET_X = 1, SET_Y, ADD_X, ADD_Y }; + +static int fsmApply(struct raft_fsm *fsm, + const struct raft_buffer *buf, + void **result) +{ + struct fsm *f = fsm->data; + const void *cursor = buf->base; + unsigned command; + int value; + + if (buf->len != 16) { + return -1; + } + + command = (unsigned)byteGet64(&cursor); + value = (int)byteGet64(&cursor); + + switch (command) { + case SET_X: + f->x = value; + break; + case SET_Y: + f->y = value; + break; + case ADD_X: + f->x += value; + break; + case ADD_Y: + f->y += value; + break; + default: + return -1; + } + + *result = NULL; + + return 0; +} + +static int fsmRestore(struct raft_fsm *fsm, struct raft_buffer *buf) +{ + struct fsm *f = fsm->data; + const void *cursor = buf->base; + + munit_assert_int(buf->len, ==, sizeof(uint64_t) * 2); + + f->x = byteGet64(&cursor); + f->y = byteGet64(&cursor); + + raft_free(buf->base); + + return 0; +} + +static int fsmEncodeSnapshot(int x, + int y, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + struct raft_buffer *buf; + void *cursor; + + *n_bufs = 1; + + *bufs = raft_malloc(sizeof **bufs); + if (*bufs == NULL) { + return RAFT_NOMEM; + } + + buf = &(*bufs)[0]; + buf->len = sizeof(uint64_t) * 2; + buf->base = raft_malloc(buf->len); + if (buf->base == NULL) { + return RAFT_NOMEM; + } + + cursor = (*bufs)[0].base; + + bytePut64(&cursor, x); + bytePut64(&cursor, y); + + return 0; +} + +/* For use with fsm->version 1 */ +static int fsmSnapshot_v1(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + struct fsm *f = fsm->data; + return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs); +} + +/* For use with fsmSnapshotFinalize and fsm->version >= 2 */ +static int fsmSnapshot_v2(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + struct fsm *f = fsm->data; + munit_assert_int(f->lock, ==, 0); + f->lock = 1; + f->data = raft_malloc(8); /* Detect proper cleanup in finalize */ + munit_assert_ptr_not_null(f->data); + return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs); +} + +static int fsmSnapshotInitialize(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + (void)bufs; + (void)n_bufs; + struct fsm *f = fsm->data; + munit_assert_int(f->lock, ==, 0); + f->lock = 1; + munit_assert_ptr_null(f->data); + f->data = raft_malloc(8); /* Detect proper cleanup in finalize */ + munit_assert_ptr_not_null(f->data); + return 0; +} + +static int fsmSnapshotAsync(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + struct fsm *f = fsm->data; + return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs); +} + +static int fsmSnapshotFinalize(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + (void)bufs; + (void)n_bufs; + struct fsm *f = fsm->data; + if (*bufs != NULL) { + for (unsigned i = 0; i < *n_bufs; ++i) { + raft_free((*bufs)[i].base); + } + raft_free(*bufs); + } + *bufs = NULL; + *n_bufs = 0; + munit_assert_int(f->lock, ==, 1); + f->lock = 0; + munit_assert_ptr_not_null(f->data); + raft_free(f->data); + f->data = NULL; + return 0; +} + +void FsmInit(struct raft_fsm *fsm, int version) +{ + struct fsm *f = munit_malloc(sizeof *fsm); + memset(fsm, 'x', sizeof(*fsm)); /* Fill with garbage */ + + f->x = 0; + f->y = 0; + f->lock = 0; + f->data = NULL; + + fsm->version = version; + fsm->data = f; + fsm->apply = fsmApply; + fsm->snapshot = fsmSnapshot_v1; + fsm->restore = fsmRestore; + if (version > 1) { + fsm->snapshot = 
fsmSnapshot_v2; + fsm->snapshot_finalize = fsmSnapshotFinalize; + fsm->snapshot_async = NULL; + } +} + +void FsmInitAsync(struct raft_fsm *fsm, int version) +{ + munit_assert_int(version, >, 2); + struct fsm *f = munit_malloc(sizeof *fsm); + memset(fsm, 'x', sizeof(*fsm)); /* Fill with garbage */ + + f->x = 0; + f->y = 0; + f->lock = 0; + f->data = NULL; + + fsm->version = version; + fsm->data = f; + fsm->apply = fsmApply; + fsm->snapshot = fsmSnapshotInitialize; + fsm->snapshot_async = fsmSnapshotAsync; + fsm->snapshot_finalize = fsmSnapshotFinalize; + fsm->restore = fsmRestore; +} + +void FsmClose(struct raft_fsm *fsm) +{ + struct fsm *f = fsm->data; + free(f); +} + +void FsmEncodeSetX(const int value, struct raft_buffer *buf) +{ + void *cursor; + + buf->base = raft_malloc(16); + buf->len = 16; + + munit_assert_ptr_not_null(buf->base); + + cursor = buf->base; + bytePut64(&cursor, SET_X); + bytePut64(&cursor, value); +} + +void FsmEncodeAddX(const int value, struct raft_buffer *buf) +{ + void *cursor; + + buf->base = raft_malloc(16); + buf->len = 16; + + munit_assert_ptr_not_null(buf->base); + + cursor = buf->base; + bytePut64(&cursor, ADD_X); + bytePut64(&cursor, value); +} + +void FsmEncodeSetY(const int value, struct raft_buffer *buf) +{ + void *cursor; + + buf->base = raft_malloc(16); + buf->len = 16; + + munit_assert_ptr_not_null(buf->base); + + cursor = buf->base; + bytePut64(&cursor, SET_Y); + bytePut64(&cursor, value); +} + +void FsmEncodeAddY(const int value, struct raft_buffer *buf) +{ + void *cursor; + + buf->base = raft_malloc(16); + buf->len = 16; + + munit_assert_ptr_not_null(buf->base); + + cursor = buf->base; + bytePut64(&cursor, ADD_Y); + bytePut64(&cursor, value); +} + +void FsmEncodeSnapshot(int x, + int y, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + int rc; + rc = fsmEncodeSnapshot(x, y, bufs, n_bufs); + munit_assert_int(rc, ==, 0); +} + +int FsmGetX(struct raft_fsm *fsm) +{ + struct fsm *f = fsm->data; + return f->x; +} + +int FsmGetY(struct raft_fsm *fsm) +{ + struct fsm *f = fsm->data; + return f->y; +} diff --git a/test/raft/lib/fsm.h b/test/raft/lib/fsm.h new file mode 100644 index 000000000..da82fa0f8 --- /dev/null +++ b/test/raft/lib/fsm.h @@ -0,0 +1,39 @@ +/* Test implementation of the raft_fsm interface, with fault injection. + * + * The test FSM supports only two commands: setting x and setting y. */ + +#ifndef TEST_FSM_H +#define TEST_FSM_H + +#include "../../../src/raft.h" + +void FsmInit(struct raft_fsm *fsm, int version); + +/* Same as FsmInit but with asynchronous snapshots */ +void FsmInitAsync(struct raft_fsm *fsm, int version); + +void FsmClose(struct raft_fsm *fsm); + +/* Encode a command to set x to the given value. */ +void FsmEncodeSetX(int value, struct raft_buffer *buf); + +/* Encode a command to add the given value to x. */ +void FsmEncodeAddX(int value, struct raft_buffer *buf); + +/* Encode a command to set y to the given value. */ +void FsmEncodeSetY(int value, struct raft_buffer *buf); + +/* Encode a command to add the given value to y. */ +void FsmEncodeAddY(int value, struct raft_buffer *buf); + +/* Encode a snapshot of an FSM with the given values for x and y. */ +void FsmEncodeSnapshot(int x, + int y, + struct raft_buffer *bufs[], + unsigned *n_bufs); + +/* Return the current value of x or y. 
*/ +int FsmGetX(struct raft_fsm *fsm); +int FsmGetY(struct raft_fsm *fsm); + +#endif /* TEST_FSM_H */ diff --git a/test/raft/lib/heap.c b/test/raft/lib/heap.c new file mode 100644 index 000000000..77f187ea3 --- /dev/null +++ b/test/raft/lib/heap.c @@ -0,0 +1,134 @@ +#include "heap.h" + +#include + +#include "fault.h" +#include "munit.h" + +struct heap +{ + size_t alignment; /* Value of last aligned alloc */ + struct Fault fault; /* Fault trigger. */ +}; + +static void heapInit(struct heap *h) +{ + h->alignment = 0; + FaultInit(&h->fault); +} + +static void *heapMalloc(void *data, size_t size) +{ + struct heap *h = data; + if (FaultTick(&h->fault)) { + return NULL; + } + return munit_malloc(size); +} + +static void heapFree(void *data, void *ptr) +{ + (void)data; + free(ptr); +} + +static void *heapCalloc(void *data, size_t nmemb, size_t size) +{ + struct heap *h = data; + if (FaultTick(&h->fault)) { + return NULL; + } + return munit_calloc(nmemb, size); +} + +static void *heapRealloc(void *data, void *ptr, size_t size) +{ + struct heap *h = data; + + if (FaultTick(&h->fault)) { + return NULL; + } + + ptr = realloc(ptr, size); + + if (size == 0) { + munit_assert_ptr_null(ptr); + } else { + munit_assert_ptr_not_null(ptr); + } + + return ptr; +} + +static void *heapAlignedAlloc(void *data, size_t alignment, size_t size) +{ + struct heap *h = data; + void *p; + + if (FaultTick(&h->fault)) { + return NULL; + } + + p = aligned_alloc(alignment, size); + munit_assert_ptr_not_null(p); + + h->alignment = alignment; + + return p; +} + +static void heapAlignedFree(void *data, size_t alignment, void *ptr) +{ + struct heap *h = data; + munit_assert_ulong(alignment, ==, h->alignment); + heapFree(data, ptr); +} + +static int getIntParam(const MunitParameter params[], const char *name) +{ + const char *value = munit_parameters_get(params, name); + return value != NULL ? atoi(value) : 0; +} + +void HeapSetUp(const MunitParameter params[], struct raft_heap *h) +{ + struct heap *heap = munit_malloc(sizeof *heap); + int delay = getIntParam(params, TEST_HEAP_FAULT_DELAY); + int repeat = getIntParam(params, TEST_HEAP_FAULT_REPEAT); + + munit_assert_ptr_not_null(h); + + heapInit(heap); + + FaultConfig(&heap->fault, delay, repeat); + + h->data = heap; + h->malloc = heapMalloc; + h->free = heapFree; + h->calloc = heapCalloc; + h->realloc = heapRealloc; + h->aligned_alloc = heapAlignedAlloc; + h->aligned_free = heapAlignedFree; + + raft_heap_set(h); + FaultPause(&heap->fault); +} + +void HeapTearDown(struct raft_heap *h) +{ + struct heap *heap = h->data; + free(heap); + raft_heap_set_default(); +} + +void HeapFaultConfig(struct raft_heap *h, int delay, int repeat) +{ + struct heap *heap = h->data; + FaultConfig(&heap->fault, delay, repeat); +} + +void HeapFaultEnable(struct raft_heap *h) +{ + struct heap *heap = h->data; + FaultResume(&heap->fault); +} diff --git a/test/raft/lib/heap.h b/test/raft/lib/heap.h new file mode 100644 index 000000000..33f79f1e2 --- /dev/null +++ b/test/raft/lib/heap.h @@ -0,0 +1,33 @@ +/* Add support for fault injection and leak detection to stdlib's malloc() + * family. */ + +#ifndef TEST_HEAP_H +#define TEST_HEAP_H + +#include "../../../src/raft.h" +#include "munit.h" + +/* Munit parameter defining after how many API calls the test raft_heap + * implementation should start failing and return errors. The default is -1, + * meaning that no failure will ever occur. 
*/
+#define TEST_HEAP_FAULT_DELAY "heap-fault-delay"
+
+/* Munit parameter defining how many consecutive times API calls against the
+ * test raft_heap implementation should keep failing after they started
+ * failing. This parameter has an effect only if 'heap-fault-delay' is 0 or
+ * greater. The default is 1, and -1 means "keep failing forever". */
+#define TEST_HEAP_FAULT_REPEAT "heap-fault-repeat"
+
+/* Macro helpers. */
+#define FIXTURE_HEAP struct raft_heap heap
+#define SET_UP_HEAP HeapSetUp(params, &f->heap)
+#define TEAR_DOWN_HEAP HeapTearDown(&f->heap)
+#define HEAP_FAULT_ENABLE HeapFaultEnable(&f->heap)
+
+void HeapSetUp(const MunitParameter params[], struct raft_heap *h);
+void HeapTearDown(struct raft_heap *h);
+
+void HeapFaultConfig(struct raft_heap *h, int delay, int repeat);
+void HeapFaultEnable(struct raft_heap *h);
+
+#endif /* TEST_HEAP_H */
diff --git a/test/raft/lib/loop.c b/test/raft/lib/loop.c
new file mode 100644
index 000000000..6a63161a4
--- /dev/null
+++ b/test/raft/lib/loop.c
@@ -0,0 +1,7 @@
+#include "loop.h"
+
+void test_loop_walk_cb(uv_handle_t *handle, void *arg)
+{
+    (void)arg;
+    munit_logf(MUNIT_LOG_INFO, "handle %d", handle->type);
+}
diff --git a/test/raft/lib/loop.h b/test/raft/lib/loop.h
new file mode 100644
index 000000000..03d3832fd
--- /dev/null
+++ b/test/raft/lib/loop.h
@@ -0,0 +1,115 @@
+/* Add support for using the libuv loop in tests. */
+
+#ifndef TEST_LOOP_H
+#define TEST_LOOP_H
+
+#include <uv.h>
+
+#include "../../../src/raft.h"
+#include "munit.h"
+
+/* Max n. of loop iterations run by a single function call */
+#define LOOP_MAX_RUN 20
+
+#define FIXTURE_LOOP struct uv_loop_s loop
+
+/* Older libuv versions might try to free() memory that was not allocated. */
+#if HAVE_DECL_UV_FS_O_CREAT
+#define LOOP_REPLACE_ALLOCATOR                                         \
+    _rv = uv_replace_allocator(raft_malloc, raft_realloc, raft_calloc, \
+                               raft_free);                             \
+    munit_assert_int(_rv, ==, 0)
+#else
+#define LOOP_REPLACE_ALLOCATOR
+#endif
+
+#define SETUP_LOOP                        \
+    {                                     \
+        int _rv;                          \
+        LOOP_REPLACE_ALLOCATOR;           \
+        _rv = uv_loop_init(&f->loop);     \
+        munit_assert_int(_rv, ==, 0);     \
+    }
+
+#define TEAR_DOWN_LOOP                                                     \
+    {                                                                      \
+        int rv_;                                                           \
+        int alive_ = uv_loop_alive(&f->loop);                              \
+        if (alive_ != 0) {                                                 \
+            LOOP_STOP;                                                     \
+        }                                                                  \
+        rv_ = uv_loop_close(&f->loop);                                     \
+        if (rv_ != 0) {                                                    \
+            uv_walk(&f->loop, test_loop_walk_cb, NULL);                    \
+            munit_errorf("uv_loop_close: %s (%d)", uv_strerror(rv_), rv_); \
+        }                                                                  \
+        rv_ = uv_replace_allocator(malloc, realloc, calloc, free);         \
+        munit_assert_int(rv_, ==, 0);                                      \
+    }
+
+/* Run the loop until there are no pending active handles or the given number
+ * of iterations is reached. */
+#define LOOP_RUN(N)                                                       \
+    {                                                                     \
+        unsigned i__;                                                     \
+        int rv__;                                                         \
+        for (i__ = 0; i__ < N; i__++) {                                   \
+            rv__ = uv_run(&f->loop, UV_RUN_ONCE);                         \
+            if (rv__ < 0) {                                               \
+                munit_errorf("uv_run: %s (%d)", uv_strerror(rv__), rv__); \
+            }                                                             \
+            if (rv__ == 0) {                                              \
+                break;                                                    \
+            }                                                             \
+        }                                                                 \
+    }
+
+/* Run the loop until the value stored through the given boolean pointer is
+ * true.
+ *
+ * If the loop exhausts all active handles or if #LOOP_MAX_RUN is reached, the
+ * test fails.
*/ +#define LOOP_RUN_UNTIL(CONDITION) \ + { \ + unsigned __i; \ + int __rv; \ + for (__i = 0; __i < LOOP_MAX_RUN; __i++) { \ + if (*(CONDITION)) { \ + break; \ + } \ + __rv = uv_run(&f->loop, UV_RUN_ONCE); \ + if (__rv < 0) { \ + munit_errorf("uv_run: %s (%d)", uv_strerror(__rv), __rv); \ + } \ + if (__rv == 0) { \ + if (*(CONDITION)) { \ + break; \ + } \ + munit_errorf("uv_run: stopped after %u iterations", __i + 1); \ + } \ + } \ + if (!*(CONDITION)) { \ + munit_errorf("uv_run: condition not met in %d iterations", \ + LOOP_MAX_RUN); \ + } \ + } + +/* Run the loop until there are no pending active handles. + * + * If there are still pending active handles after LOOP_MAX_RUN iterations, the + * test will fail. + * + * This is meant to be used in tear down functions. */ +#define LOOP_STOP \ + { \ + int alive__; \ + LOOP_RUN(LOOP_MAX_RUN); \ + alive__ = uv_loop_alive(&f->loop); \ + if (alive__ != 0) { \ + munit_error("loop has still pending active handles"); \ + } \ + } + +void test_loop_walk_cb(uv_handle_t *handle, void *arg); + +#endif /* TEST_LOOP_H */ diff --git a/test/raft/lib/macros.h b/test/raft/lib/macros.h new file mode 100644 index 000000000..9af9bd024 --- /dev/null +++ b/test/raft/lib/macros.h @@ -0,0 +1,13 @@ +/** + * Miscellaneous test macros. + */ + +#ifndef TEST_MACROS_H_ +#define TEST_MACROS_H_ + +#define GET_2ND_ARG(arg1, arg2, ...) arg2 +#define GET_3RD_ARG(arg1, arg2, arg3, ...) arg3 +#define GET_4TH_ARG(arg1, arg2, arg3, arg4, ...) arg4 +#define GET_5TH_ARG(arg1, arg2, arg3, arg4, arg5, ...) arg5 + +#endif /* TEST_MACROS_H_ */ diff --git a/test/raft/lib/munit.c b/test/raft/lib/munit.c new file mode 100644 index 000000000..1d496f4b5 --- /dev/null +++ b/test/raft/lib/munit.c @@ -0,0 +1,2077 @@ +/* Copyright (c) 2013-2018 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*** Configuration ***/ + +/* This is just where the output from the test goes. It's really just + * meant to let you choose stdout or stderr, but if anyone really want + * to direct it to a file let me know, it would be fairly easy to + * support. */ +#if !defined(MUNIT_OUTPUT_FILE) +# define MUNIT_OUTPUT_FILE stdout +#endif + +/* This is a bit more useful; it tells µnit how to format the seconds in + * timed tests. If your tests run for longer you might want to reduce + * it, and if your computer is really fast and your tests are tiny you + * can increase it. 
*/ +#if !defined(MUNIT_TEST_TIME_FORMAT) +# define MUNIT_TEST_TIME_FORMAT "0.8f" +#endif + +/* If you have long test names you might want to consider bumping + * this. The result information takes 43 characters. */ +#if !defined(MUNIT_TEST_NAME_LEN) +# define MUNIT_TEST_NAME_LEN 37 +#endif + +/* If you don't like the timing information, you can disable it by + * defining MUNIT_DISABLE_TIMING. */ +#if !defined(MUNIT_DISABLE_TIMING) +# define MUNIT_ENABLE_TIMING +#endif + +/*** End configuration ***/ + +#if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L) +# undef _POSIX_C_SOURCE +#endif +#if !defined(_POSIX_C_SOURCE) +# define _POSIX_C_SOURCE 200809L +#endif + +/* Solaris freaks out if you try to use a POSIX or SUS standard without + * the "right" C standard. */ +#if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif + +#if defined(__STDC_VERSION__) +# if __STDC_VERSION__ >= 201112L +# define _XOPEN_SOURCE 700 +# elif __STDC_VERSION__ >= 199901L +# define _XOPEN_SOURCE 600 +# endif +#endif + +/* Because, according to Microsoft, POSIX is deprecated. You've got + * to appreciate the chutzpah. */ +#if defined(_MSC_VER) && !defined(_CRT_NONSTDC_NO_DEPRECATE) +# define _CRT_NONSTDC_NO_DEPRECATE +#endif + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) +# include +#elif defined(_WIN32) +/* https://msdn.microsoft.com/en-us/library/tf4dy80a.aspx */ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(MUNIT_NO_NL_LANGINFO) && !defined(_WIN32) +#define MUNIT_NL_LANGINFO +#include +#include +#include +#endif + +#if !defined(_WIN32) +# include +# include +# include +#else +# include +# include +# include +# if !defined(STDERR_FILENO) +# define STDERR_FILENO _fileno(stderr) +# endif +#endif + +#include "munit.h" + +#define MUNIT_STRINGIFY(x) #x +#define MUNIT_XSTRINGIFY(x) MUNIT_STRINGIFY(x) + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) || defined(__IBMCPP__) +# define MUNIT_THREAD_LOCAL __thread +#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201102L)) || defined(_Thread_local) +# define MUNIT_THREAD_LOCAL _Thread_local +#elif defined(_WIN32) +# define MUNIT_THREAD_LOCAL __declspec(thread) +#endif + +/* MSVC 12.0 will emit a warning at /W4 for code like 'do { ... } + * while (0)', or 'do { ... } while (true)'. I'm pretty sure nobody + * at Microsoft compiles with /W4. */ +#if defined(_MSC_VER) && (_MSC_VER <= 1800) +#pragma warning(disable: 4127) +#endif + +#if defined(_WIN32) || defined(__EMSCRIPTEN__) +# define MUNIT_NO_FORK +#endif + +#if defined(__EMSCRIPTEN__) +# define MUNIT_NO_BUFFER +#endif + +/*** Logging ***/ + +static MunitLogLevel munit_log_level_visible = MUNIT_LOG_INFO; +static MunitLogLevel munit_log_level_fatal = MUNIT_LOG_ERROR; + +#if defined(MUNIT_THREAD_LOCAL) +static MUNIT_THREAD_LOCAL bool munit_error_jmp_buf_valid = false; +static MUNIT_THREAD_LOCAL jmp_buf munit_error_jmp_buf; +#endif + +#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) +static MUNIT_THREAD_LOCAL bool munit_tear_down_jmp_buf_valid = false; +static MUNIT_THREAD_LOCAL jmp_buf munit_tear_down_jmp_buf; +#endif + +/* At certain warning levels, mingw will trigger warnings about + * suggesting the format attribute, which we've explicitly *not* set + * because it will then choke on our attempts to use the MS-specific + * I64 modifier for size_t (which we have to use since MSVC doesn't + * support the C99 z modifier). 
*/ + +#if defined(__MINGW32__) || defined(__MINGW64__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +#endif + +MUNIT_PRINTF(5,0) +static void +munit_logf_exv(MunitLogLevel level, FILE* fp, const char* filename, int line, const char* format, va_list ap) { + if (level < munit_log_level_visible) + return; + + switch (level) { + case MUNIT_LOG_DEBUG: + fputs("Debug", fp); + break; + case MUNIT_LOG_INFO: + fputs("Info", fp); + break; + case MUNIT_LOG_WARNING: + fputs("Warning", fp); + break; + case MUNIT_LOG_ERROR: + fputs("Error", fp); + break; + default: + munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Invalid log level (%d)", level); + return; + } + + fputs(": ", fp); + if (filename != NULL) + fprintf(fp, "%s:%d: ", filename, line); + vfprintf(fp, format, ap); + fputc('\n', fp); +} + +MUNIT_PRINTF(3,4) +static void +munit_logf_internal(MunitLogLevel level, FILE* fp, const char* format, ...) { + va_list ap; + + va_start(ap, format); + munit_logf_exv(level, fp, NULL, 0, format, ap); + va_end(ap); +} + +static void +munit_log_internal(MunitLogLevel level, FILE* fp, const char* message) { + munit_logf_internal(level, fp, "%s", message); +} + +void +munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...) { + va_list ap; + + va_start(ap, format); + munit_logf_exv(level, stderr, filename, line, format, ap); + va_end(ap); + + if (level >= munit_log_level_fatal) { +#if defined(MUNIT_THREAD_LOCAL) + if (munit_error_jmp_buf_valid) + longjmp(munit_error_jmp_buf, 1); +#endif + abort(); + } +} + +void +munit_errorf_ex(const char* filename, int line, const char* format, ...) { + va_list ap; + + va_start(ap, format); + munit_logf_exv(MUNIT_LOG_ERROR, stderr, filename, line, format, ap); + va_end(ap); + +#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) + if (munit_tear_down_jmp_buf_valid) + longjmp(munit_tear_down_jmp_buf, 1); +#endif + +#if defined(MUNIT_THREAD_LOCAL) + if (munit_error_jmp_buf_valid) + longjmp(munit_error_jmp_buf, 1); +#endif + abort(); +} + +#if defined(__MINGW32__) || defined(__MINGW64__) +#pragma GCC diagnostic pop +#endif + +#if !defined(MUNIT_STRERROR_LEN) +# define MUNIT_STRERROR_LEN 80 +#endif + +static void +munit_log_errno(MunitLogLevel level, FILE* fp, const char* msg) { +#if defined(MUNIT_NO_STRERROR_R) || (defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API)) + munit_logf_internal(level, fp, "%s: %s (%d)", msg, strerror(errno), errno); +#else + char munit_error_str[MUNIT_STRERROR_LEN]; + munit_error_str[0] = '\0'; + +#if !defined(_WIN32) + strerror_r(errno, munit_error_str, MUNIT_STRERROR_LEN); +#else + strerror_s(munit_error_str, MUNIT_STRERROR_LEN, errno); +#endif + + munit_logf_internal(level, fp, "%s: %s (%d)", msg, munit_error_str, errno); +#endif +} + +/*** Memory allocation ***/ + +void* +munit_malloc_ex(const char* filename, int line, size_t size) { + void* ptr; + + if (size == 0) + return NULL; + + ptr = calloc(1, size); + if (MUNIT_UNLIKELY(ptr == NULL)) { + munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Failed to allocate %" MUNIT_SIZE_MODIFIER "u bytes.", size); + } + + return ptr; +} + +/*** Timer code ***/ + +#if defined(MUNIT_ENABLE_TIMING) + +#define psnip_uint64_t munit_uint64_t +#define psnip_uint32_t munit_uint32_t + +/* Code copied from portable-snippets + * . If you need to + * change something, please do it there so we can keep the code in + * sync. 
*/ + +/* Clocks (v1) + * Portable Snippets - https://github.com/nemequ/portable-snippets + * Created by Evan Nemerson + * + * To the extent possible under law, the authors have waived all + * copyright and related or neighboring rights to this code. For + * details, see the Creative Commons Zero 1.0 Universal license at + * https://creativecommons.org/publicdomain/zero/1.0/ + */ + +#if !defined(PSNIP_CLOCK_H) +#define PSNIP_CLOCK_H + +#if !defined(psnip_uint64_t) +# include "../exact-int/exact-int.h" +#endif + +#if !defined(PSNIP_CLOCK_STATIC_INLINE) +# if defined(__GNUC__) +# define PSNIP_CLOCK__COMPILER_ATTRIBUTES __attribute__((__unused__)) +# else +# define PSNIP_CLOCK__COMPILER_ATTRIBUTES +# endif + +# define PSNIP_CLOCK__FUNCTION PSNIP_CLOCK__COMPILER_ATTRIBUTES static +#endif + +enum PsnipClockType { + /* This clock provides the current time, in units since 1970-01-01 + * 00:00:00 UTC not including leap seconds. In other words, UNIX + * time. Keep in mind that this clock doesn't account for leap + * seconds, and can go backwards (think NTP adjustments). */ + PSNIP_CLOCK_TYPE_WALL = 1, + /* The CPU time is a clock which increases only when the current + * process is active (i.e., it doesn't increment while blocking on + * I/O). */ + PSNIP_CLOCK_TYPE_CPU = 2, + /* Monotonic time is always running (unlike CPU time), but it only + ever moves forward unless you reboot the system. Things like NTP + adjustments have no effect on this clock. */ + PSNIP_CLOCK_TYPE_MONOTONIC = 3 +}; + +struct PsnipClockTimespec { + psnip_uint64_t seconds; + psnip_uint64_t nanoseconds; +}; + +/* Methods we support: */ + +#define PSNIP_CLOCK_METHOD_CLOCK_GETTIME 1 +#define PSNIP_CLOCK_METHOD_TIME 2 +#define PSNIP_CLOCK_METHOD_GETTIMEOFDAY 3 +#define PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER 4 +#define PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME 5 +#define PSNIP_CLOCK_METHOD_CLOCK 6 +#define PSNIP_CLOCK_METHOD_GETPROCESSTIMES 7 +#define PSNIP_CLOCK_METHOD_GETRUSAGE 8 +#define PSNIP_CLOCK_METHOD_GETSYSTEMTIMEPRECISEASFILETIME 9 +#define PSNIP_CLOCK_METHOD_GETTICKCOUNT64 10 + +#include + +#if defined(HEDLEY_UNREACHABLE) +# define PSNIP_CLOCK_UNREACHABLE() HEDLEY_UNREACHABLE() +#else +# define PSNIP_CLOCK_UNREACHABLE() assert(0) +#endif + +/* Choose an implementation */ + +/* #undef PSNIP_CLOCK_WALL_METHOD */ +/* #undef PSNIP_CLOCK_CPU_METHOD */ +/* #undef PSNIP_CLOCK_MONOTONIC_METHOD */ + +/* We want to be able to detect the libc implementation, so we include + ( isn't available everywhere). */ + +#if defined(__unix__) || defined(__unix) || defined(__linux__) +# include +# include +#endif + +#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) +/* These are known to work without librt. If you know of others + * please let us know so we can add them. 
*/ +# if \ + (defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17))) || \ + (defined(__FreeBSD__)) +# define PSNIP_CLOCK_HAVE_CLOCK_GETTIME +# elif !defined(PSNIP_CLOCK_NO_LIBRT) +# define PSNIP_CLOCK_HAVE_CLOCK_GETTIME +# endif +#endif + +#if defined(_WIN32) +# if !defined(PSNIP_CLOCK_CPU_METHOD) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_GETPROCESSTIMES +# endif +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER +# endif +#endif + +#if defined(__MACH__) && !defined(__gnu_hurd__) +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME +# endif +#endif + +#if defined(PSNIP_CLOCK_HAVE_CLOCK_GETTIME) +# include +# if !defined(PSNIP_CLOCK_WALL_METHOD) +# if defined(CLOCK_REALTIME_PRECISE) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME_PRECISE +# elif !defined(__sun) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME +# endif +# endif +# if !defined(PSNIP_CLOCK_CPU_METHOD) +# if defined(_POSIX_CPUTIME) || defined(CLOCK_PROCESS_CPUTIME_ID) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_PROCESS_CPUTIME_ID +# elif defined(CLOCK_VIRTUAL) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_VIRTUAL +# endif +# endif +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# if defined(CLOCK_MONOTONIC_RAW) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC +# elif defined(CLOCK_MONOTONIC_PRECISE) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC_PRECISE +# elif defined(_POSIX_MONOTONIC_CLOCK) || defined(CLOCK_MONOTONIC) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC +# endif +# endif +#endif + +#if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L) +# if !defined(PSNIP_CLOCK_WALL_METHOD) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_GETTIMEOFDAY +# endif +#endif + +#if !defined(PSNIP_CLOCK_WALL_METHOD) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_TIME +#endif + +#if !defined(PSNIP_CLOCK_CPU_METHOD) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK +#endif + +/* Primarily here for testing. */ +#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) && defined(PSNIP_CLOCK_REQUIRE_MONOTONIC) +# error No monotonic clock found. 
+#endif + +/* Implementations */ + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_TIME)) +# include +#endif + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) +# include +#endif + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) +# include +#endif + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) +# include +# include +#endif + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) +# include +# include +# include +#endif + +/*** Implementations ***/ + +#define PSNIP_CLOCK_NSEC_PER_SEC ((psnip_uint32_t) (1000000000ULL)) + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) +PSNIP_CLOCK__FUNCTION psnip_uint32_t +psnip_clock__clock_getres (clockid_t clk_id) { + struct timespec res; + int r; + + r = clock_getres(clk_id, &res); + if (r != 0) + return 0; + + return (psnip_uint32_t) (PSNIP_CLOCK_NSEC_PER_SEC / res.tv_nsec); +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock__clock_gettime (clockid_t clk_id, struct 
PsnipClockTimespec* res) { + struct timespec ts; + + if (clock_gettime(clk_id, &ts) != 0) + return -10; + + res->seconds = (psnip_uint64_t) (ts.tv_sec); + res->nanoseconds = (psnip_uint64_t) (ts.tv_nsec); + + return 0; +} +#endif + +PSNIP_CLOCK__FUNCTION psnip_uint32_t +psnip_clock_wall_get_precision (void) { +#if !defined(PSNIP_CLOCK_WALL_METHOD) + return 0; +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_WALL); +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY + return 1000000; +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME + return 1; +#else + return 0; +#endif +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock_wall_get_time (struct PsnipClockTimespec* res) { + (void) res; + +#if !defined(PSNIP_CLOCK_WALL_METHOD) + return -2; +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_WALL, res); +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME + res->seconds = time(NULL); + res->nanoseconds = 0; +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY + struct timeval tv; + + if (gettimeofday(&tv, NULL) != 0) + return -6; + + res->seconds = tv.tv_sec; + res->nanoseconds = tv.tv_usec * 1000; +#else + return -2; +#endif + + return 0; +} + +PSNIP_CLOCK__FUNCTION psnip_uint32_t +psnip_clock_cpu_get_precision (void) { +#if !defined(PSNIP_CLOCK_CPU_METHOD) + return 0; +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_CPU); +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK + return CLOCKS_PER_SEC; +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES + return PSNIP_CLOCK_NSEC_PER_SEC / 100; +#else + return 0; +#endif +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock_cpu_get_time (struct PsnipClockTimespec* res) { +#if !defined(PSNIP_CLOCK_CPU_METHOD) + (void) res; + return -2; +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_CPU, res); +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK + clock_t t = clock(); + if (t == ((clock_t) -1)) + return -5; + res->seconds = t / CLOCKS_PER_SEC; + res->nanoseconds = (t % CLOCKS_PER_SEC) * (PSNIP_CLOCK_NSEC_PER_SEC / CLOCKS_PER_SEC); +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES + FILETIME CreationTime, ExitTime, KernelTime, UserTime; + LARGE_INTEGER date, adjust; + + if (!GetProcessTimes(GetCurrentProcess(), &CreationTime, &ExitTime, &KernelTime, &UserTime)) + return -7; + + /* http://www.frenk.com/2009/12/convert-filetime-to-unix-timestamp/ */ + date.HighPart = UserTime.dwHighDateTime; + date.LowPart = UserTime.dwLowDateTime; + adjust.QuadPart = 11644473600000 * 10000; + date.QuadPart -= adjust.QuadPart; + + res->seconds = date.QuadPart / 10000000; + res->nanoseconds = (date.QuadPart % 10000000) * (PSNIP_CLOCK_NSEC_PER_SEC / 100); +#elif PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage) != 
+    return -8;
+
+  res->seconds = usage.ru_utime.tv_sec;
+  res->nanoseconds = usage.ru_utime.tv_usec * 1000;
+#else
+  (void) res;
+  return -2;
+#endif
+
+  return 0;
+}
+
+PSNIP_CLOCK__FUNCTION psnip_uint32_t
+psnip_clock_monotonic_get_precision (void) {
+#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+  return 0;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC);
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
+  static mach_timebase_info_data_t tbi = { 0, };
+  if (tbi.denom == 0)
+    mach_timebase_info(&tbi);
+  return (psnip_uint32_t) (tbi.numer / tbi.denom);
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
+  return 1000;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
+  LARGE_INTEGER Frequency;
+  QueryPerformanceFrequency(&Frequency);
+  return (psnip_uint32_t) ((Frequency.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC) ? PSNIP_CLOCK_NSEC_PER_SEC : Frequency.QuadPart);
+#else
+  return 0;
+#endif
+}
+
+PSNIP_CLOCK__FUNCTION int
+psnip_clock_monotonic_get_time (struct PsnipClockTimespec* res) {
+#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+  (void) res;
+  return -2;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC, res);
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
+  psnip_uint64_t nsec = mach_absolute_time();
+  static mach_timebase_info_data_t tbi = { 0, };
+  if (tbi.denom == 0)
+    mach_timebase_info(&tbi);
+  nsec *= ((psnip_uint64_t) tbi.numer) / ((psnip_uint64_t) tbi.denom);
+  res->seconds = nsec / PSNIP_CLOCK_NSEC_PER_SEC;
+  res->nanoseconds = nsec % PSNIP_CLOCK_NSEC_PER_SEC;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
+  LARGE_INTEGER t, f;
+  if (QueryPerformanceCounter(&t) == 0)
+    return -12;
+
+  QueryPerformanceFrequency(&f);
+  res->seconds = t.QuadPart / f.QuadPart;
+  res->nanoseconds = t.QuadPart % f.QuadPart;
+  if (f.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC)
+    res->nanoseconds /= f.QuadPart / PSNIP_CLOCK_NSEC_PER_SEC;
+  else
+    res->nanoseconds *= PSNIP_CLOCK_NSEC_PER_SEC / f.QuadPart;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
+  const ULONGLONG msec = GetTickCount64();
+  res->seconds = msec / 1000;
+  /* GetTickCount64() returns milliseconds; convert the sub-second
+   * remainder to nanoseconds. */
+  res->nanoseconds = (msec % 1000) * (PSNIP_CLOCK_NSEC_PER_SEC / 1000);
+#else
+  (void) res;
+  return -2;
+#endif
+
+  return 0;
+}
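+
+/* Usage sketch (illustrative only, not part of the vendored code): reading
+ * a monotonic timestamp with the getters defined above.
+ *
+ *   struct PsnipClockTimespec ts;
+ *   if (psnip_clock_monotonic_get_time(&ts) == 0)
+ *     printf("%llu.%09llu\n",
+ *            (unsigned long long) ts.seconds,
+ *            (unsigned long long) ts.nanoseconds);
+ *   else
+ *     puts("no monotonic clock available on this platform");
+ */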
+
+/* Returns the number of ticks per second for the specified clock.
+ * For example, a clock with millisecond precision would return 1000,
+ * and a clock with 1 second precision (such as the time() function)
+ * would return 1.
+ *
+ * If the requested clock isn't available, it will return 0.
+ * Hopefully this will be rare, but if it happens to you please let us
+ * know so we can work on finding a way to support your system.
+ *
+ * Note that different clocks on the same system often have different
+ * precisions.
+ */
+PSNIP_CLOCK__FUNCTION psnip_uint32_t
+psnip_clock_get_precision (enum PsnipClockType clock_type) {
+  switch (clock_type) {
+    case PSNIP_CLOCK_TYPE_MONOTONIC:
+      return psnip_clock_monotonic_get_precision ();
+    case PSNIP_CLOCK_TYPE_CPU:
+      return psnip_clock_cpu_get_precision ();
+    case PSNIP_CLOCK_TYPE_WALL:
+      return psnip_clock_wall_get_precision ();
+  }
+
+  PSNIP_CLOCK_UNREACHABLE();
+  return 0;
+}
+
+/* Set the provided timespec to the requested time.  Returns 0 on
+ * success, or a negative value on failure. */
+PSNIP_CLOCK__FUNCTION int
+psnip_clock_get_time (enum PsnipClockType clock_type, struct PsnipClockTimespec* res) {
+  assert(res != NULL);
+
+  switch (clock_type) {
+    case PSNIP_CLOCK_TYPE_MONOTONIC:
+      return psnip_clock_monotonic_get_time (res);
+    case PSNIP_CLOCK_TYPE_CPU:
+      return psnip_clock_cpu_get_time (res);
+    case PSNIP_CLOCK_TYPE_WALL:
+      return psnip_clock_wall_get_time (res);
+  }
+
+  return -1;
+}
+
+#endif /* !defined(PSNIP_CLOCK_H) */
+
+static psnip_uint64_t
+munit_clock_get_elapsed(struct PsnipClockTimespec* start, struct PsnipClockTimespec* end) {
+  psnip_uint64_t r = (end->seconds - start->seconds) * PSNIP_CLOCK_NSEC_PER_SEC;
+  if (end->nanoseconds < start->nanoseconds) {
+    r -= (start->nanoseconds - end->nanoseconds);
+  } else {
+    r += (end->nanoseconds - start->nanoseconds);
+  }
+  return r;
+}
+
+#else
+# include <time.h>
+#endif /* defined(MUNIT_ENABLE_TIMING) */
+
+/*** PRNG stuff ***/
+
+/* This is (unless I screwed up, which is entirely possible) the
+ * version of PCG with 32-bit state.  It was chosen because it has a
+ * small enough state that we should reliably be able to use CAS
+ * instead of requiring a lock for thread-safety.
+ *
+ * If I did screw up, I probably will not bother changing it unless
+ * there is a significant bias.  It's really not important that this be
+ * particularly strong; as long as it is fairly random, it's much more
+ * important that it be reproducible, so bug reports have a better
+ * chance of being reproducible. */
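+
+/* Illustrative sketch of one PCG-style draw (constant names are made up
+ * here; the real values live in munit_rand_next_state() and
+ * munit_rand_from_state() below):
+ *
+ *   state = state * MULTIPLIER + INCREMENT;                // LCG advance
+ *   out   = ((state >> ((state >> 28) + 4)) ^ state) * K;  // permute
+ *   out  ^= out >> 22;                                     // final xorshift
+ *
+ * Because the whole state is a single 32-bit word, an atomic
+ * compare-and-swap can publish a new state without taking a lock. */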
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && !defined(__EMSCRIPTEN__) && (!defined(__GNUC_MINOR__) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8))
+# define HAVE_STDATOMIC
+#elif defined(__clang__)
+# if __has_extension(c_atomic)
+#  define HAVE_CLANG_ATOMICS
+# endif
+#endif
+
+/* Workaround for http://llvm.org/bugs/show_bug.cgi?id=26911 */
+#if defined(__clang__) && defined(_WIN32)
+# undef HAVE_STDATOMIC
+# if defined(__c2__)
+#  undef HAVE_CLANG_ATOMICS
+# endif
+#endif
+
+#if defined(_OPENMP)
+# define ATOMIC_UINT32_T uint32_t
+# define ATOMIC_UINT32_INIT(x) (x)
+#elif defined(HAVE_STDATOMIC)
+# include <stdatomic.h>
+# define ATOMIC_UINT32_T _Atomic uint32_t
+# define ATOMIC_UINT32_INIT(x) ATOMIC_VAR_INIT(x)
+#elif defined(HAVE_CLANG_ATOMICS)
+# define ATOMIC_UINT32_T _Atomic uint32_t
+# define ATOMIC_UINT32_INIT(x) (x)
+#elif defined(_WIN32)
+# define ATOMIC_UINT32_T volatile LONG
+# define ATOMIC_UINT32_INIT(x) (x)
+#else
+# define ATOMIC_UINT32_T volatile uint32_t
+# define ATOMIC_UINT32_INIT(x) (x)
+#endif
+
+static ATOMIC_UINT32_T munit_rand_state = ATOMIC_UINT32_INIT(42);
+
+#if defined(_OPENMP)
+static inline void
+munit_atomic_store(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T value) {
+#pragma omp critical (munit_atomics)
+  *dest = value;
+}
+
+static inline uint32_t
+munit_atomic_load(ATOMIC_UINT32_T* src) {
+  uint32_t ret;
+#pragma omp critical (munit_atomics)
+  ret = *src;
+  return ret;
+}
+
+static inline bool
+munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T* expected, ATOMIC_UINT32_T desired) {
+  bool ret;
+
+#pragma omp critical (munit_atomics)
+  {
+    if (*dest == *expected) {
+      *dest = desired;
+      ret = true;
+    } else {
+      ret = false;
+    }
+  }
+
+  return ret;
+}
+#elif defined(HAVE_STDATOMIC)
+# define munit_atomic_store(dest, value) atomic_store(dest, value)
+# define munit_atomic_load(src) atomic_load(src)
+# define munit_atomic_cas(dest, expected, value) atomic_compare_exchange_weak(dest, expected, value)
+#elif defined(HAVE_CLANG_ATOMICS)
+# define munit_atomic_store(dest, value) __c11_atomic_store(dest, value, __ATOMIC_SEQ_CST)
+# define munit_atomic_load(src) __c11_atomic_load(src, __ATOMIC_SEQ_CST)
+# define munit_atomic_cas(dest, expected, value) __c11_atomic_compare_exchange_weak(dest, expected, value, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+# define munit_atomic_store(dest, value) __atomic_store_n(dest, value, __ATOMIC_SEQ_CST)
+# define munit_atomic_load(src) __atomic_load_n(src, __ATOMIC_SEQ_CST)
+# define munit_atomic_cas(dest, expected, value) __atomic_compare_exchange_n(dest, expected, value, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#elif defined(__GNUC__) && (__GNUC__ >= 4)
+# define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0)
+# define munit_atomic_load(src) (*(src))
+# define munit_atomic_cas(dest, expected, value) __sync_bool_compare_and_swap(dest, *expected, value)
+#elif defined(_WIN32) /* Untested */
+# define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0)
+# define munit_atomic_load(src) (*(src))
+# define munit_atomic_cas(dest, expected, value) InterlockedCompareExchange((dest), (value), *(expected))
+#else
+# warning No atomic implementation, PRNG will not be thread-safe
+# define munit_atomic_store(dest, value) do { *(dest) = (value); } while (0)
+# define munit_atomic_load(src) (*(src))
+static inline bool
+munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T*
expected, ATOMIC_UINT32_T desired) { + if (*dest == *expected) { + *dest = desired; + return true; + } else { + return false; + } +} +#endif + +#define MUNIT_PRNG_MULTIPLIER (747796405U) +#define MUNIT_PRNG_INCREMENT (1729U) + +static munit_uint32_t +munit_rand_next_state(munit_uint32_t state) { + return state * MUNIT_PRNG_MULTIPLIER + MUNIT_PRNG_INCREMENT; +} + +static munit_uint32_t +munit_rand_from_state(munit_uint32_t state) { + munit_uint32_t res = ((state >> ((state >> 28) + 4)) ^ state) * (277803737U); + res ^= res >> 22; + return res; +} + +void +munit_rand_seed(munit_uint32_t seed) { + munit_uint32_t state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); + munit_atomic_store(&munit_rand_state, state); +} + +static munit_uint32_t +munit_rand_generate_seed(void) { + munit_uint32_t seed, state; +#if defined(MUNIT_ENABLE_TIMING) + struct PsnipClockTimespec wc = { 0, 0 }; + + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wc); + seed = (munit_uint32_t) wc.nanoseconds; +#else + seed = (munit_uint32_t) time(NULL); +#endif + + state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); + return munit_rand_from_state(state); +} + +static munit_uint32_t +munit_rand_state_uint32(munit_uint32_t* state) { + const munit_uint32_t old = *state; + *state = munit_rand_next_state(old); + return munit_rand_from_state(old); +} + +munit_uint32_t +munit_rand_uint32(void) { + munit_uint32_t old, state; + + do { + old = munit_atomic_load(&munit_rand_state); + state = munit_rand_next_state(old); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return munit_rand_from_state(old); +} + +static void +munit_rand_state_memory(munit_uint32_t* state, size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) { + size_t members_remaining = size / sizeof(munit_uint32_t); + size_t bytes_remaining = size % sizeof(munit_uint32_t); + munit_uint8_t* b = data; + munit_uint32_t rv; + while (members_remaining-- > 0) { + rv = munit_rand_state_uint32(state); + memcpy(b, &rv, sizeof(munit_uint32_t)); + b += sizeof(munit_uint32_t); + } + if (bytes_remaining != 0) { + rv = munit_rand_state_uint32(state); + memcpy(b, &rv, bytes_remaining); + } +} + +void +munit_rand_memory(size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) { + munit_uint32_t old, state; + + do { + state = old = munit_atomic_load(&munit_rand_state); + munit_rand_state_memory(&state, size, data); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); +} + +static munit_uint32_t +munit_rand_state_at_most(munit_uint32_t* state, munit_uint32_t salt, munit_uint32_t max) { + /* We want (UINT32_MAX + 1) % max, which in unsigned arithmetic is the same + * as (UINT32_MAX + 1 - max) % max = -max % max. We compute -max using not + * to avoid compiler warnings. 
+ */ + const munit_uint32_t min = (~max + 1U) % max; + munit_uint32_t x; + + if (max == (~((munit_uint32_t) 0U))) + return munit_rand_state_uint32(state) ^ salt; + + max++; + + do { + x = munit_rand_state_uint32(state) ^ salt; + } while (x < min); + + return x % max; +} + +static munit_uint32_t +munit_rand_at_most(munit_uint32_t salt, munit_uint32_t max) { + munit_uint32_t old, state; + munit_uint32_t retval; + + do { + state = old = munit_atomic_load(&munit_rand_state); + retval = munit_rand_state_at_most(&state, salt, max); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return retval; +} + +int +munit_rand_int_range(int min, int max) { + munit_uint64_t range = (munit_uint64_t) max - (munit_uint64_t) min; + + if (min > max) + return munit_rand_int_range(max, min); + + if (range > (~((munit_uint32_t) 0U))) + range = (~((munit_uint32_t) 0U)); + + return min + munit_rand_at_most(0, (munit_uint32_t) range); +} + +double +munit_rand_double(void) { + munit_uint32_t old, state; + double retval = 0.0; + + do { + state = old = munit_atomic_load(&munit_rand_state); + + /* See http://mumble.net/~campbell/tmp/random_real.c for how to do + * this right. Patches welcome if you feel that this is too + * biased. */ + retval = munit_rand_state_uint32(&state) / ((~((munit_uint32_t) 0U)) + 1.0); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return retval; +} + +/*** Test suite handling ***/ + +typedef struct { + unsigned int successful; + unsigned int skipped; + unsigned int failed; + unsigned int errored; +#if defined(MUNIT_ENABLE_TIMING) + munit_uint64_t cpu_clock; + munit_uint64_t wall_clock; +#endif +} MunitReport; + +typedef struct { + const char* prefix; + const MunitSuite* suite; + const char** tests; + munit_uint32_t seed; + unsigned int iterations; + MunitParameter* parameters; + bool single_parameter_mode; + void* user_data; + MunitReport report; + bool colorize; + bool fork; + bool show_stderr; + bool fatal_failures; +} MunitTestRunner; + +const char* +munit_parameters_get(const MunitParameter params[], const char* key) { + const MunitParameter* param; + + for (param = params ; param != NULL && param->name != NULL ; param++) + if (strcmp(param->name, key) == 0) + return param->value; + return NULL; +} + +#if defined(MUNIT_ENABLE_TIMING) +static void +munit_print_time(FILE* fp, munit_uint64_t nanoseconds) { + fprintf(fp, "%" MUNIT_TEST_TIME_FORMAT, ((double) nanoseconds) / ((double) PSNIP_CLOCK_NSEC_PER_SEC)); +} +#endif + +/* Add a parameter to an array of parameters. */ +static MunitResult +munit_parameters_add(size_t* params_size, MunitParameter* params[MUNIT_ARRAY_PARAM(*params_size)], char* name, char* value) { + *params = realloc(*params, sizeof(MunitParameter) * (*params_size + 2)); + if (*params == NULL) + return MUNIT_ERROR; + + (*params)[*params_size].name = name; + (*params)[*params_size].value = value; + (*params_size)++; + (*params)[*params_size].name = NULL; + (*params)[*params_size].value = NULL; + + return MUNIT_OK; +} + +/* Concatenate two strings, but just return one of the components + * unaltered if the other is NULL or "". */ +static char* +munit_maybe_concat(size_t* len, char* prefix, char* suffix) { + char* res; + size_t res_l; + const size_t prefix_l = prefix != NULL ? strlen(prefix) : 0; + const size_t suffix_l = suffix != NULL ? 
strlen(suffix) : 0; + if (prefix_l == 0 && suffix_l == 0) { + res = NULL; + res_l = 0; + } else if (prefix_l == 0 && suffix_l != 0) { + res = suffix; + res_l = suffix_l; + } else if (prefix_l != 0 && suffix_l == 0) { + res = prefix; + res_l = prefix_l; + } else { + res_l = prefix_l + suffix_l; + res = malloc(res_l + 1); + memcpy(res, prefix, prefix_l); + memcpy(res + prefix_l, suffix, suffix_l); + res[res_l] = 0; + } + + if (len != NULL) + *len = res_l; + + return res; +} + +/* Possibly free a string returned by munit_maybe_concat. */ +static void +munit_maybe_free_concat(char* s, const char* prefix, const char* suffix) { + if (prefix != s && suffix != s) + free(s); +} + +/* Cheap string hash function, just used to salt the PRNG. */ +static munit_uint32_t +munit_str_hash(const char* name) { + const char *p; + munit_uint32_t h = 5381U; + + for (p = name; *p != '\0'; p++) + h = (h << 5) + h + *p; + + return h; +} + +static void +munit_splice(int from, int to) { + munit_uint8_t buf[1024]; +#if !defined(_WIN32) + ssize_t len; + ssize_t bytes_written; + ssize_t write_res; +#else + int len; + int bytes_written; + int write_res; +#endif + do { + len = read(from, buf, sizeof(buf)); + if (len > 0) { + bytes_written = 0; + do { + write_res = write(to, buf + bytes_written, len - bytes_written); + if (write_res < 0) + break; + bytes_written += write_res; + } while (bytes_written < len); + } + else + break; + } while (true); +} + +/* This is the part that should be handled in the child process */ +static MunitResult +munit_test_runner_exec(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[], MunitReport* report) { + unsigned int iterations = runner->iterations; + MunitResult result = MUNIT_FAIL; +#if defined(MUNIT_ENABLE_TIMING) + struct PsnipClockTimespec wall_clock_begin = { 0, 0 }, wall_clock_end = { 0, 0 }; + struct PsnipClockTimespec cpu_clock_begin = { 0, 0 }, cpu_clock_end = { 0, 0 }; +#endif + unsigned int i = 0; + + if ((test->options & MUNIT_TEST_OPTION_SINGLE_ITERATION) == MUNIT_TEST_OPTION_SINGLE_ITERATION) + iterations = 1; + else if (iterations == 0) + iterations = runner->suite->iterations; + + munit_rand_seed(runner->seed); + + do { + void* data = (test->setup == NULL) ? 
runner->user_data : test->setup(params, runner->user_data); + +#if defined(MUNIT_ENABLE_TIMING) + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_begin); + psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_begin); +#endif + +#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) + if (test->tear_down != NULL) { + if (MUNIT_UNLIKELY(setjmp(munit_tear_down_jmp_buf) != 0)) { + test->tear_down(data); + longjmp(munit_error_jmp_buf, 1); + } else { + munit_tear_down_jmp_buf_valid = true; + } + } +#endif + + result = test->test(params, data); + +#if defined(MUNIT_ENABLE_TIMING) + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_end); + psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_end); +#endif + + if (test->tear_down != NULL) + test->tear_down(data); + + if (MUNIT_LIKELY(result == MUNIT_OK)) { + report->successful++; +#if defined(MUNIT_ENABLE_TIMING) + report->wall_clock += munit_clock_get_elapsed(&wall_clock_begin, &wall_clock_end); + report->cpu_clock += munit_clock_get_elapsed(&cpu_clock_begin, &cpu_clock_end); +#endif + } else { + switch ((int) result) { + case MUNIT_SKIP: + report->skipped++; + break; + case MUNIT_FAIL: + report->failed++; + break; + case MUNIT_ERROR: + report->errored++; + break; + default: + break; + } + break; + } + } while (++i < iterations); + + return result; +} + +#if defined(MUNIT_EMOTICON) +# define MUNIT_RESULT_STRING_OK ":)" +# define MUNIT_RESULT_STRING_SKIP ":|" +# define MUNIT_RESULT_STRING_FAIL ":(" +# define MUNIT_RESULT_STRING_ERROR ":o" +# define MUNIT_RESULT_STRING_TODO ":/" +#else +# define MUNIT_RESULT_STRING_OK "OK " +# define MUNIT_RESULT_STRING_SKIP "SKIP " +# define MUNIT_RESULT_STRING_FAIL "FAIL " +# define MUNIT_RESULT_STRING_ERROR "ERROR" +# define MUNIT_RESULT_STRING_TODO "TODO " +#endif + +static void +munit_test_runner_print_color(const MunitTestRunner* runner, const char* string, char color) { + if (runner->colorize) + fprintf(MUNIT_OUTPUT_FILE, "\x1b[3%cm%s\x1b[39m", color, string); + else + fputs(string, MUNIT_OUTPUT_FILE); +} + +#if !defined(MUNIT_NO_BUFFER) +static int +munit_replace_stderr(FILE* stderr_buf) { + if (stderr_buf != NULL) { + const int orig_stderr = dup(STDERR_FILENO); + + int errfd = fileno(stderr_buf); + if (MUNIT_UNLIKELY(errfd == -1)) { + exit(EXIT_FAILURE); + } + + dup2(errfd, STDERR_FILENO); + + return orig_stderr; + } + + return -1; +} + +static void +munit_restore_stderr(int orig_stderr) { + if (orig_stderr != -1) { + dup2(orig_stderr, STDERR_FILENO); + close(orig_stderr); + } +} +#endif /* !defined(MUNIT_NO_BUFFER) */ + +/* Run a test with the specified parameters. 
*/ +static void +munit_test_runner_run_test_with_params(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[]) { + MunitResult result = MUNIT_OK; + MunitReport report = { + 0, 0, 0, 0, +#if defined(MUNIT_ENABLE_TIMING) + 0, 0 +#endif + }; + unsigned int output_l; + bool first; + const MunitParameter* param; + FILE* stderr_buf; +#if !defined(MUNIT_NO_FORK) + int pipefd[2]; + pid_t fork_pid; + ssize_t bytes_written = 0; + ssize_t write_res; + ssize_t bytes_read = 0; + ssize_t read_res; + int status = 0; + pid_t changed_pid; +#endif + + if (params != NULL) { + output_l = 2; + fputs(" ", MUNIT_OUTPUT_FILE); + first = true; + for (param = params ; param != NULL && param->name != NULL ; param++) { + if (!first) { + fputs(", ", MUNIT_OUTPUT_FILE); + output_l += 2; + } else { + first = false; + } + + output_l += fprintf(MUNIT_OUTPUT_FILE, "%s=%s", param->name, param->value); + } + while (output_l++ < MUNIT_TEST_NAME_LEN) { + fputc(' ', MUNIT_OUTPUT_FILE); + } + } + + fflush(MUNIT_OUTPUT_FILE); + + stderr_buf = NULL; +#if !defined(_WIN32) || defined(__MINGW32__) + stderr_buf = tmpfile(); +#else + tmpfile_s(&stderr_buf); +#endif + if (stderr_buf == NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create buffer for stderr"); + result = MUNIT_ERROR; + goto print_result; + } + +#if !defined(MUNIT_NO_FORK) + if (runner->fork) { + pipefd[0] = -1; + pipefd[1] = -1; + if (pipe(pipefd) != 0) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create pipe"); + result = MUNIT_ERROR; + goto print_result; + } + + fork_pid = fork(); + if (fork_pid == 0) { + int orig_stderr; + + close(pipefd[0]); + + orig_stderr = munit_replace_stderr(stderr_buf); + munit_test_runner_exec(runner, test, params, &report); + + /* Note that we don't restore stderr. This is so we can buffer + * things written to stderr later on (such as by + * asan/tsan/ubsan, valgrind, etc.) 
*/ + close(orig_stderr); + + do { + write_res = write(pipefd[1], ((munit_uint8_t*) (&report)) + bytes_written, sizeof(report) - bytes_written); + if (write_res < 0) { + if (stderr_buf != NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to write to pipe"); + } + exit(EXIT_FAILURE); + } + bytes_written += write_res; + } while ((size_t) bytes_written < sizeof(report)); + + if (stderr_buf != NULL) + fclose(stderr_buf); + close(pipefd[1]); + + exit(EXIT_SUCCESS); + } else if (fork_pid == -1) { + close(pipefd[0]); + close(pipefd[1]); + if (stderr_buf != NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to fork"); + } + report.errored++; + result = MUNIT_ERROR; + } else { + close(pipefd[1]); + do { + read_res = read(pipefd[0], ((munit_uint8_t*) (&report)) + bytes_read, sizeof(report) - bytes_read); + if (read_res < 1) + break; + bytes_read += read_res; + } while (bytes_read < (ssize_t) sizeof(report)); + + changed_pid = waitpid(fork_pid, &status, 0); + + if (MUNIT_LIKELY(changed_pid == fork_pid) && MUNIT_LIKELY(WIFEXITED(status))) { + if (bytes_read != sizeof(report)) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited unexpectedly with status %d", WEXITSTATUS(status)); + report.errored++; + } else if (WEXITSTATUS(status) != EXIT_SUCCESS) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited with status %d", WEXITSTATUS(status)); + report.errored++; + } + } else { + if (WIFSIGNALED(status)) { +#if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 700) + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d (%s)", WTERMSIG(status), strsignal(WTERMSIG(status))); +#else + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d", WTERMSIG(status)); +#endif + } else if (WIFSTOPPED(status)) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child stopped by signal %d", WSTOPSIG(status)); + } + report.errored++; + } + + close(pipefd[0]); + waitpid(fork_pid, NULL, 0); + } + } else +#endif + { +#if !defined(MUNIT_NO_BUFFER) + const volatile int orig_stderr = munit_replace_stderr(stderr_buf); +#endif + +#if defined(MUNIT_THREAD_LOCAL) + if (MUNIT_UNLIKELY(setjmp(munit_error_jmp_buf) != 0)) { + result = MUNIT_FAIL; + report.failed++; + } else { + munit_error_jmp_buf_valid = true; + result = munit_test_runner_exec(runner, test, params, &report); + } +#else + result = munit_test_runner_exec(runner, test, params, &report); +#endif + +#if !defined(MUNIT_NO_BUFFER) + munit_restore_stderr(orig_stderr); +#endif + + /* Here just so that the label is used on Windows and we don't get + * a warning */ + goto print_result; + } + + print_result: + + fputs("[ ", MUNIT_OUTPUT_FILE); + if ((test->options & MUNIT_TEST_OPTION_TODO) == MUNIT_TEST_OPTION_TODO) { + if (report.failed != 0 || report.errored != 0 || report.skipped != 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_TODO, '3'); + result = MUNIT_OK; + } else { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); + if (MUNIT_LIKELY(stderr_buf != NULL)) + munit_log_internal(MUNIT_LOG_ERROR, stderr_buf, "Test marked TODO, but was successful."); + runner->report.failed++; + result = MUNIT_ERROR; + } + } else if (report.failed > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_FAIL, '1'); + runner->report.failed++; + result = MUNIT_FAIL; + } else if (report.errored > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); + runner->report.errored++; + result = MUNIT_ERROR; + } else if (report.skipped > 0) { + 
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_SKIP, '3'); + runner->report.skipped++; + result = MUNIT_SKIP; + } else if (report.successful > 1) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); +#if defined(MUNIT_ENABLE_TIMING) + fputs(" ] [ ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock / report.successful); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock / report.successful); + fprintf(MUNIT_OUTPUT_FILE, " CPU ]\n %-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s Total: [ ", ""); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); + fputs(" CPU", MUNIT_OUTPUT_FILE); +#endif + runner->report.successful++; + result = MUNIT_OK; + } else if (report.successful > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); +#if defined(MUNIT_ENABLE_TIMING) + fputs(" ] [ ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); + fputs(" CPU", MUNIT_OUTPUT_FILE); +#endif + runner->report.successful++; + result = MUNIT_OK; + } + fputs(" ]\n", MUNIT_OUTPUT_FILE); + + if (stderr_buf != NULL) { + if (result == MUNIT_FAIL || result == MUNIT_ERROR || runner->show_stderr) { + fflush(MUNIT_OUTPUT_FILE); + + rewind(stderr_buf); + munit_splice(fileno(stderr_buf), STDERR_FILENO); + + fflush(stderr); + } + + fclose(stderr_buf); + } +} + +static void +munit_test_runner_run_test_wild(MunitTestRunner* runner, + const MunitTest* test, + const char* test_name, + MunitParameter* params, + MunitParameter* p) { + const MunitParameterEnum* pe; + char** values; + MunitParameter* next; + + for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) { + if (p->name == pe->name) + break; + } + + if (pe == NULL) + return; + + for (values = pe->values ; *values != NULL ; values++) { + next = p + 1; + p->value = *values; + if (next->name == NULL) { + munit_test_runner_run_test_with_params(runner, test, params); + } else { + munit_test_runner_run_test_wild(runner, test, test_name, params, next); + } + if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0)) + break; + } +} + +/* Run a single test, with every combination of parameters + * requested. */ +static void +munit_test_runner_run_test(MunitTestRunner* runner, + const MunitTest* test, + const char* prefix) { + char* test_name = munit_maybe_concat(NULL, (char*) prefix, (char*) test->name); + /* The array of parameters to pass to + * munit_test_runner_run_test_with_params */ + MunitParameter* params = NULL; + size_t params_l = 0; + /* Wildcard parameters are parameters which have possible values + * specified in the test, but no specific value was passed to the + * CLI. That means we want to run the test once for every + * possible combination of parameter values or, if --single was + * passed to the CLI, a single time with a random set of + * parameters. */ + MunitParameter* wild_params = NULL; + size_t wild_params_l = 0; + const MunitParameterEnum* pe; + const MunitParameter* cli_p; + bool filled; + unsigned int possible; + char** vals; + size_t first_wild; + const MunitParameter* wp; + int pidx; + + munit_rand_seed(runner->seed); + + fprintf(MUNIT_OUTPUT_FILE, "%-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s", test_name); + + if (test->parameters == NULL) { + /* No parameters. Simple, nice. 
*/
+    munit_test_runner_run_test_with_params(runner, test, NULL);
+  } else {
+    fputc('\n', MUNIT_OUTPUT_FILE);
+
+    for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) {
+      /* Did we receive a value for this parameter from the CLI? */
+      filled = false;
+      for (cli_p = runner->parameters ; cli_p != NULL && cli_p->name != NULL ; cli_p++) {
+        if (strcmp(cli_p->name, pe->name) == 0) {
+          if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, cli_p->value) != MUNIT_OK))
+            goto cleanup;
+          filled = true;
+          break;
+        }
+      }
+      if (filled)
+        continue;
+
+      /* Nothing from CLI, is the enum NULL/empty? We're not a
+       * fuzzer… */
+      if (pe->values == NULL || pe->values[0] == NULL)
+        continue;
+
+      /* If --single was passed to the CLI, choose a value from the
+       * list of possibilities randomly. */
+      if (runner->single_parameter_mode) {
+        possible = 0;
+        for (vals = pe->values ; *vals != NULL ; vals++)
+          possible++;
+        /* We want the tests to be reproducible, even if you're only
+         * running a single test, but we don't want every test with
+         * the same number of parameters to choose the same parameter
+         * number, so use the test name as a primitive salt. */
+        pidx = munit_rand_at_most(munit_str_hash(test_name), possible - 1);
+        if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[pidx]) != MUNIT_OK))
+          goto cleanup;
+      } else {
+        /* We want to try every permutation.  Put in a placeholder
+         * entry, we'll iterate through them later. */
+        if (MUNIT_UNLIKELY(munit_parameters_add(&wild_params_l, &wild_params, pe->name, NULL) != MUNIT_OK))
+          goto cleanup;
+      }
+    }
+
+    if (wild_params_l != 0) {
+      first_wild = params_l;
+      for (wp = wild_params ; wp != NULL && wp->name != NULL ; wp++) {
+        for (pe = test->parameters ; pe != NULL && pe->name != NULL && pe->values != NULL ; pe++) {
+          if (strcmp(wp->name, pe->name) == 0) {
+            if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[0]) != MUNIT_OK))
+              goto cleanup;
+          }
+        }
+      }
+
+      munit_test_runner_run_test_wild(runner, test, test_name, params, params + first_wild);
+    } else {
+      munit_test_runner_run_test_with_params(runner, test, params);
+    }
+
+  cleanup:
+    free(params);
+    free(wild_params);
+  }
+
+  munit_maybe_free_concat(test_name, prefix, test->name);
+}
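+
+/* Worked example (illustrative, not part of the diff): a test declaring
+ *
+ *   static char* backend_values[] = { (char*) "mem", (char*) "disk", NULL };
+ *   static char* level_values[]   = { (char*) "0", (char*) "1", NULL };
+ *   static MunitParameterEnum test_params[] = {
+ *     { (char*) "backend", backend_values },
+ *     { (char*) "level", level_values },
+ *     { NULL, NULL },
+ *   };
+ *
+ * runs four times (backend=mem/level=0, mem/1, disk/0, disk/1) when no
+ * matching --param is given on the command line; with --single a single
+ * combination is chosen pseudo-randomly, salted with the test name so
+ * different tests pick different combinations. */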
*/ + for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) { + munit_test_runner_run_suite(runner, child_suite, pre); + } + + cleanup: + + munit_maybe_free_concat(pre, prefix, suite->prefix); +} + +static void +munit_test_runner_run(MunitTestRunner* runner) { + munit_test_runner_run_suite(runner, runner->suite, NULL); +} + +static void +munit_print_help(int argc, char* const argv[MUNIT_ARRAY_PARAM(argc + 1)], void* user_data, const MunitArgument arguments[]) { + const MunitArgument* arg; + (void) argc; + + printf("USAGE: %s [OPTIONS...] [TEST...]\n\n", argv[0]); + puts(" --seed SEED\n" + " Value used to seed the PRNG. Must be a 32-bit integer in decimal\n" + " notation with no separators (commas, decimals, spaces, etc.), or\n" + " hexadecimal prefixed by \"0x\".\n" + " --iterations N\n" + " Run each test N times. 0 means the default number.\n" + " --param name value\n" + " A parameter key/value pair which will be passed to any test with\n" + " takes a parameter of that name. If not provided, the test will be\n" + " run once for each possible parameter value.\n" + " --list Write a list of all available tests.\n" + " --list-params\n" + " Write a list of all available tests and their possible parameters.\n" + " --single Run each parameterized test in a single configuration instead of\n" + " every possible combination\n" + " --log-visible debug|info|warning|error\n" + " --log-fatal debug|info|warning|error\n" + " Set the level at which messages of different severities are visible,\n" + " or cause the test to terminate.\n" +#if !defined(MUNIT_NO_FORK) + " --no-fork Do not execute tests in a child process. If this option is supplied\n" + " and a test crashes (including by failing an assertion), no further\n" + " tests will be performed.\n" +#endif + " --fatal-failures\n" + " Stop executing tests as soon as a failure is found.\n" + " --show-stderr\n" + " Show data written to stderr by the tests, even if the test succeeds.\n" + " --color auto|always|never\n" + " Colorize (or don't) the output.\n" + /* 12345678901234567890123456789012345678901234567890123456789012345678901234567890 */ + " --help Print this help message and exit.\n"); +#if defined(MUNIT_NL_LANGINFO) + setlocale(LC_ALL, ""); + fputs((strcasecmp("UTF-8", nl_langinfo(CODESET)) == 0) ? 
"µnit" : "munit", stdout); +#else + puts("munit"); +#endif + printf(" %d.%d.%d\n" + "Full documentation at: https://nemequ.github.io/munit/\n", + (MUNIT_CURRENT_VERSION >> 16) & 0xff, + (MUNIT_CURRENT_VERSION >> 8) & 0xff, + (MUNIT_CURRENT_VERSION >> 0) & 0xff); + for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++) + arg->write_help(arg, user_data); +} + +static const MunitArgument* +munit_arguments_find(const MunitArgument arguments[], const char* name) { + const MunitArgument* arg; + + for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++) + if (strcmp(arg->name, name) == 0) + return arg; + + return NULL; +} + +static void +munit_suite_list_tests(const MunitSuite* suite, bool show_params, const char* prefix) { + size_t pre_l; + char* pre = munit_maybe_concat(&pre_l, (char*) prefix, (char*) suite->prefix); + const MunitTest* test; + const MunitParameterEnum* params; + bool first; + char** val; + const MunitSuite* child_suite; + + for (test = suite->tests ; + test != NULL && test->name != NULL ; + test++) { + if (pre != NULL) + fputs(pre, stdout); + puts(test->name); + + if (show_params) { + for (params = test->parameters ; + params != NULL && params->name != NULL ; + params++) { + fprintf(stdout, " - %s: ", params->name); + if (params->values == NULL) { + puts("Any"); + } else { + first = true; + for (val = params->values ; + *val != NULL ; + val++ ) { + if(!first) { + fputs(", ", stdout); + } else { + first = false; + } + fputs(*val, stdout); + } + putc('\n', stdout); + } + } + } + } + + for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) { + munit_suite_list_tests(child_suite, show_params, pre); + } + + munit_maybe_free_concat(pre, prefix, suite->prefix); +} + +static bool +munit_stream_supports_ansi(FILE *stream) { +#if !defined(_WIN32) + return isatty(fileno(stream)); +#else + +#if !defined(__MINGW32__) + size_t ansicon_size = 0; +#endif + + if (isatty(fileno(stream))) { +#if !defined(__MINGW32__) + getenv_s(&ansicon_size, NULL, 0, "ANSICON"); + return ansicon_size != 0; +#else + return getenv("ANSICON") != NULL; +#endif + } + return false; +#endif +} + +int +munit_suite_main_custom(const MunitSuite* suite, void* user_data, + int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)], + const MunitArgument arguments[]) { + int result = EXIT_FAILURE; + MunitTestRunner runner; + size_t parameters_size = 0; + size_t tests_size = 0; + int arg; + + char* envptr; + unsigned long ts; + char* endptr; + unsigned long long iterations; + MunitLogLevel level; + const MunitArgument* argument; + const char** runner_tests; + unsigned int tests_run; + unsigned int tests_total; + + runner.prefix = NULL; + runner.suite = NULL; + runner.tests = NULL; + runner.seed = 0; + runner.iterations = 0; + runner.parameters = NULL; + runner.single_parameter_mode = false; + runner.user_data = NULL; + + runner.report.successful = 0; + runner.report.skipped = 0; + runner.report.failed = 0; + runner.report.errored = 0; +#if defined(MUNIT_ENABLE_TIMING) + runner.report.cpu_clock = 0; + runner.report.wall_clock = 0; +#endif + + runner.colorize = false; +#if !defined(_WIN32) + runner.fork = true; +#else + runner.fork = false; +#endif + runner.show_stderr = false; + runner.fatal_failures = false; + runner.suite = suite; + runner.user_data = user_data; + runner.seed = munit_rand_generate_seed(); + runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); + + for (arg = 1 ; arg < argc ; arg++) { + if (strncmp("--", argv[arg], 2) == 0) { + if 
(strcmp("seed", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); + goto cleanup; + } + + envptr = argv[arg + 1]; + ts = strtoul(argv[arg + 1], &envptr, 0); + if (*envptr != '\0' || ts > (~((munit_uint32_t) 0U))) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); + goto cleanup; + } + runner.seed = (munit_uint32_t) ts; + + arg++; + } else if (strcmp("iterations", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); + goto cleanup; + } + + endptr = argv[arg + 1]; + iterations = strtoul(argv[arg + 1], &endptr, 0); + if (*endptr != '\0' || iterations > UINT_MAX) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); + goto cleanup; + } + + runner.iterations = (unsigned int) iterations; + + arg++; + } else if (strcmp("param", argv[arg] + 2) == 0) { + if (arg + 2 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires two arguments", argv[arg]); + goto cleanup; + } + + runner.parameters = realloc(runner.parameters, sizeof(MunitParameter) * (parameters_size + 2)); + if (runner.parameters == NULL) { + munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory"); + goto cleanup; + } + runner.parameters[parameters_size].name = (char*) argv[arg + 1]; + runner.parameters[parameters_size].value = (char*) argv[arg + 2]; + parameters_size++; + runner.parameters[parameters_size].name = NULL; + runner.parameters[parameters_size].value = NULL; + arg += 2; + } else if (strcmp("color", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); + goto cleanup; + } + + if (strcmp(argv[arg + 1], "always") == 0) + runner.colorize = true; + else if (strcmp(argv[arg + 1], "never") == 0) + runner.colorize = false; + else if (strcmp(argv[arg + 1], "auto") == 0) + runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); + else { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); + goto cleanup; + } + + arg++; + } else if (strcmp("help", argv[arg] + 2) == 0) { + munit_print_help(argc, argv, user_data, arguments); + result = EXIT_SUCCESS; + goto cleanup; + } else if (strcmp("single", argv[arg] + 2) == 0) { + runner.single_parameter_mode = true; + } else if (strcmp("show-stderr", argv[arg] + 2) == 0) { + runner.show_stderr = true; +#if !defined(_WIN32) + } else if (strcmp("no-fork", argv[arg] + 2) == 0) { + runner.fork = false; +#endif + } else if (strcmp("fatal-failures", argv[arg] + 2) == 0) { + runner.fatal_failures = true; + } else if (strcmp("log-visible", argv[arg] + 2) == 0 || + strcmp("log-fatal", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); + goto cleanup; + } + + if (strcmp(argv[arg + 1], "debug") == 0) + level = MUNIT_LOG_DEBUG; + else if (strcmp(argv[arg + 1], "info") == 0) + level = MUNIT_LOG_INFO; + else if (strcmp(argv[arg + 1], "warning") == 0) + level = MUNIT_LOG_WARNING; + else if (strcmp(argv[arg + 1], "error") == 0) + level = MUNIT_LOG_ERROR; + else { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); + goto cleanup; + } + + if (strcmp("log-visible", argv[arg] + 2) == 0) + munit_log_level_visible = level; + 
else
+            munit_log_level_fatal = level;
+
+          arg++;
+        } else if (strcmp("list", argv[arg] + 2) == 0) {
+          munit_suite_list_tests(suite, false, NULL);
+          result = EXIT_SUCCESS;
+          goto cleanup;
+        } else if (strcmp("list-params", argv[arg] + 2) == 0) {
+          munit_suite_list_tests(suite, true, NULL);
+          result = EXIT_SUCCESS;
+          goto cleanup;
+        } else {
+          argument = munit_arguments_find(arguments, argv[arg] + 2);
+          if (argument == NULL) {
+            munit_logf_internal(MUNIT_LOG_ERROR, stderr, "unknown argument ('%s')", argv[arg]);
+            goto cleanup;
+          }
+
+          if (!argument->parse_argument(suite, user_data, &arg, argc, argv))
+            goto cleanup;
+        }
+      } else {
+        runner_tests = realloc((void*) runner.tests, sizeof(char*) * (tests_size + 2));
+        if (runner_tests == NULL) {
+          munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory");
+          goto cleanup;
+        }
+        runner.tests = runner_tests;
+        runner.tests[tests_size++] = argv[arg];
+        runner.tests[tests_size] = NULL;
+      }
+    }
+
+  fflush(stderr);
+  fprintf(MUNIT_OUTPUT_FILE, "Running test suite with seed 0x%08" PRIx32 "...\n", runner.seed);
+
+  munit_test_runner_run(&runner);
+
+  tests_run = runner.report.successful + runner.report.failed + runner.report.errored;
+  tests_total = tests_run + runner.report.skipped;
+  if (tests_run == 0) {
+    fprintf(stderr, "No tests run, %d (100%%) skipped.\n", runner.report.skipped);
+  } else {
+    fprintf(MUNIT_OUTPUT_FILE, "%d of %d (%0.0f%%) tests successful, %d (%0.0f%%) tests skipped.\n",
+            runner.report.successful, tests_run,
+            (((double) runner.report.successful) / ((double) tests_run)) * 100.0,
+            runner.report.skipped,
+            (((double) runner.report.skipped) / ((double) tests_total)) * 100.0);
+  }
+
+  if (runner.report.failed == 0 && runner.report.errored == 0) {
+    result = EXIT_SUCCESS;
+  }
+
+ cleanup:
+  free(runner.parameters);
+  free((void*) runner.tests);
+
+  return result;
+}
+
+int
+munit_suite_main(const MunitSuite* suite, void* user_data,
+                 int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]) {
+  return munit_suite_main_custom(suite, user_data, argc, argv, NULL);
+}
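+
+/* Example invocations (illustrative; the binary and test names are
+ * hypothetical, the flags are the ones handled above):
+ *
+ *   ./unit-test --seed 0x12345678          replay a previous run exactly
+ *   ./unit-test --iterations 100 vfs       repeat the tests matching "vfs"
+ *   ./unit-test --no-fork --show-stderr    debug a crashing test under gdb
+ */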
diff --git a/test/raft/lib/munit.h b/test/raft/lib/munit.h
new file mode 100644
index 000000000..0b6796b4b
--- /dev/null
+++ b/test/raft/lib/munit.h
@@ -0,0 +1,535 @@
+/* µnit Testing Framework
+ * Copyright (c) 2013-2017 Evan Nemerson
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(MUNIT_H)
+#define MUNIT_H
+
+#include <stdarg.h>
+#include <stdlib.h>
+
+#define MUNIT_VERSION(major, minor, revision) \
+  (((major) << 16) | ((minor) << 8) | (revision))
+
+#define MUNIT_CURRENT_VERSION MUNIT_VERSION(0, 4, 1)
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+# define munit_int8_t __int8
+# define munit_uint8_t unsigned __int8
+# define munit_int16_t __int16
+# define munit_uint16_t unsigned __int16
+# define munit_int32_t __int32
+# define munit_uint32_t unsigned __int32
+# define munit_int64_t __int64
+# define munit_uint64_t unsigned __int64
+#else
+# include <stdint.h>
+# define munit_int8_t int8_t
+# define munit_uint8_t uint8_t
+# define munit_int16_t int16_t
+# define munit_uint16_t uint16_t
+# define munit_int32_t int32_t
+# define munit_uint32_t uint32_t
+# define munit_int64_t int64_t
+# define munit_uint64_t uint64_t
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
+# if !defined(PRIi8)
+#  define PRIi8 "i"
+# endif
+# if !defined(PRIi16)
+#  define PRIi16 "i"
+# endif
+# if !defined(PRIi32)
+#  define PRIi32 "i"
+# endif
+# if !defined(PRIi64)
+#  define PRIi64 "I64i"
+# endif
+# if !defined(PRId8)
+#  define PRId8 "d"
+# endif
+# if !defined(PRId16)
+#  define PRId16 "d"
+# endif
+# if !defined(PRId32)
+#  define PRId32 "d"
+# endif
+# if !defined(PRId64)
+#  define PRId64 "I64d"
+# endif
+# if !defined(PRIx8)
+#  define PRIx8 "x"
+# endif
+# if !defined(PRIx16)
+#  define PRIx16 "x"
+# endif
+# if !defined(PRIx32)
+#  define PRIx32 "x"
+# endif
+# if !defined(PRIx64)
+#  define PRIx64 "I64x"
+# endif
+# if !defined(PRIu8)
+#  define PRIu8 "u"
+# endif
+# if !defined(PRIu16)
+#  define PRIu16 "u"
+# endif
+# if !defined(PRIu32)
+#  define PRIu32 "u"
+# endif
+# if !defined(PRIu64)
+#  define PRIu64 "I64u"
+# endif
+# if !defined(bool)
+#  define bool int
+# endif
+# if !defined(true)
+#  define true (!0)
+# endif
+# if !defined(false)
+#  define false (!!0)
+# endif
+#else
+# include <stdbool.h>
+# include <inttypes.h>
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+# define MUNIT_LIKELY(expr) (__builtin_expect ((expr), 1))
+# define MUNIT_UNLIKELY(expr) (__builtin_expect ((expr), 0))
+# define MUNIT_UNUSED __attribute__((__unused__))
+#else
+# define MUNIT_LIKELY(expr) (expr)
+# define MUNIT_UNLIKELY(expr) (expr)
+# define MUNIT_UNUSED
+#endif
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__PGI)
+# define MUNIT_ARRAY_PARAM(name) name
+#else
+# define MUNIT_ARRAY_PARAM(name)
+#endif
+
+#if !defined(_WIN32)
+# define MUNIT_SIZE_MODIFIER "z"
+# define MUNIT_CHAR_MODIFIER "hh"
+# define MUNIT_SHORT_MODIFIER "h"
+#else
+# if defined(_M_X64) || defined(__amd64__)
+#  define MUNIT_SIZE_MODIFIER "I64"
+# else
+#  define MUNIT_SIZE_MODIFIER ""
+# endif
+# define MUNIT_CHAR_MODIFIER ""
+# define MUNIT_SHORT_MODIFIER ""
+#endif
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+# define MUNIT_NO_RETURN _Noreturn
+#elif defined(__GNUC__)
+# define MUNIT_NO_RETURN __attribute__((__noreturn__))
+#elif defined(_MSC_VER)
+# define MUNIT_NO_RETURN __declspec(noreturn)
+#else
+# define MUNIT_NO_RETURN
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1500)
+# define MUNIT__PUSH_DISABLE_MSVC_C4127 __pragma(warning(push)) __pragma(warning(disable:4127))
+# define MUNIT__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
+#else
+# define MUNIT__PUSH_DISABLE_MSVC_C4127
+# define MUNIT__POP_DISABLE_MSVC_C4127
+#endif
+
+typedef enum {
+  MUNIT_LOG_DEBUG,
+  MUNIT_LOG_INFO,
+  MUNIT_LOG_WARNING,
+  MUNIT_LOG_ERROR
+} MunitLogLevel;
+
+#if defined(__GNUC__) &&
!defined(__MINGW32__) +# define MUNIT_PRINTF(string_index, first_to_check) __attribute__((format (printf, string_index, first_to_check))) +#else +# define MUNIT_PRINTF(string_index, first_to_check) +#endif + +MUNIT_PRINTF(4, 5) +void munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...); + +#define munit_logf(level, format, ...) \ + munit_logf_ex(level, __FILE__, __LINE__, format, __VA_ARGS__) + +#define munit_log(level, msg) \ + munit_logf(level, "%s", msg) + +MUNIT_NO_RETURN +MUNIT_PRINTF(3, 4) +void munit_errorf_ex(const char* filename, int line, const char* format, ...); + +#define munit_errorf(format, ...) \ + munit_errorf_ex(__FILE__, __LINE__, format, __VA_ARGS__) + +#define munit_error(msg) \ + munit_errorf("%s", msg) + +#define munit_assert(expr) \ + do { \ + if (!MUNIT_LIKELY(expr)) { \ + munit_error("assertion failed: " #expr); \ + } \ + MUNIT__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + MUNIT__POP_DISABLE_MSVC_C4127 + +#define munit_assert_true(expr) \ + do { \ + if (!MUNIT_LIKELY(expr)) { \ + munit_error("assertion failed: " #expr " is not true"); \ + } \ + MUNIT__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + MUNIT__POP_DISABLE_MSVC_C4127 + +#define munit_assert_false(expr) \ + do { \ + if (!MUNIT_LIKELY(!(expr))) { \ + munit_error("assertion failed: " #expr " is not false"); \ + } \ + MUNIT__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + MUNIT__POP_DISABLE_MSVC_C4127 + +#define munit_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ + do { \ + T munit_tmp_a_ = (a); \ + T munit_tmp_b_ = (b); \ + if (!(munit_tmp_a_ op munit_tmp_b_)) { \ + munit_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")", \ + #a, #op, #b, munit_tmp_a_, #op, munit_tmp_b_); \ + } \ + MUNIT__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + MUNIT__POP_DISABLE_MSVC_C4127 + +#define munit_assert_type(T, fmt, a, op, b) \ + munit_assert_type_full("", "", T, fmt, a, op, b) + +#define munit_assert_char(a, op, b) \ + munit_assert_type_full("'\\x", "'", char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b) +#define munit_assert_uchar(a, op, b) \ + munit_assert_type_full("'\\x", "'", unsigned char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b) +#define munit_assert_short(a, op, b) \ + munit_assert_type(short, MUNIT_SHORT_MODIFIER "d", a, op, b) +#define munit_assert_ushort(a, op, b) \ + munit_assert_type(unsigned short, MUNIT_SHORT_MODIFIER "u", a, op, b) +#define munit_assert_int(a, op, b) \ + munit_assert_type(int, "d", a, op, b) +#define munit_assert_uint(a, op, b) \ + munit_assert_type(unsigned int, "u", a, op, b) +#define munit_assert_long(a, op, b) \ + munit_assert_type(long int, "ld", a, op, b) +#define munit_assert_ulong(a, op, b) \ + munit_assert_type(unsigned long int, "lu", a, op, b) +#define munit_assert_llong(a, op, b) \ + munit_assert_type(long long int, "lld", a, op, b) +#define munit_assert_ullong(a, op, b) \ + munit_assert_type(unsigned long long int, "llu", a, op, b) + +#define munit_assert_size(a, op, b) \ + munit_assert_type(size_t, MUNIT_SIZE_MODIFIER "u", a, op, b) + +#define munit_assert_float(a, op, b) \ + munit_assert_type(float, "f", a, op, b) +#define munit_assert_double(a, op, b) \ + munit_assert_type(double, "g", a, op, b) +#define munit_assert_ptr(a, op, b) \ + munit_assert_type(const void*, "p", a, op, b) + +#define munit_assert_int8(a, op, b) \ + munit_assert_type(munit_int8_t, PRIi8, a, op, b) +#define munit_assert_uint8(a, op, b) \ + munit_assert_type(munit_uint8_t, PRIu8, a, op, b) +#define munit_assert_int16(a, op, b) \ + 
munit_assert_type(munit_int16_t, PRIi16, a, op, b)
+#define munit_assert_uint16(a, op, b) \
+  munit_assert_type(munit_uint16_t, PRIu16, a, op, b)
+#define munit_assert_int32(a, op, b) \
+  munit_assert_type(munit_int32_t, PRIi32, a, op, b)
+#define munit_assert_uint32(a, op, b) \
+  munit_assert_type(munit_uint32_t, PRIu32, a, op, b)
+#define munit_assert_int64(a, op, b) \
+  munit_assert_type(munit_int64_t, PRIi64, a, op, b)
+#define munit_assert_uint64(a, op, b) \
+  munit_assert_type(munit_uint64_t, PRIu64, a, op, b)
+
+#define munit_assert_double_equal(a, b, precision) \
+  do { \
+    const double munit_tmp_a_ = (a); \
+    const double munit_tmp_b_ = (b); \
+    const double munit_tmp_diff_ = ((munit_tmp_a_ - munit_tmp_b_) < 0) ? \
+      -(munit_tmp_a_ - munit_tmp_b_) : \
+      (munit_tmp_a_ - munit_tmp_b_); \
+    if (MUNIT_UNLIKELY(munit_tmp_diff_ > 1e-##precision)) { \
+      munit_errorf("assertion failed: %s == %s (%0." #precision "g == %0." #precision "g)", \
+                   #a, #b, munit_tmp_a_, munit_tmp_b_); \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#include <string.h>
+#define munit_assert_string_equal(a, b) \
+  do { \
+    const char* munit_tmp_a_ = a; \
+    const char* munit_tmp_b_ = b; \
+    if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) != 0)) { \
+      munit_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")", \
+                   #a, #b, munit_tmp_a_, munit_tmp_b_); \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#define munit_assert_string_not_equal(a, b) \
+  do { \
+    const char* munit_tmp_a_ = a; \
+    const char* munit_tmp_b_ = b; \
+    if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) == 0)) { \
+      munit_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")", \
+                   #a, #b, munit_tmp_a_, munit_tmp_b_); \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#define munit_assert_memory_equal(size, a, b) \
+  do { \
+    const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \
+    const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \
+    const size_t munit_tmp_size_ = (size); \
+    if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_) != 0)) { \
+      size_t munit_tmp_pos_; \
+      for (munit_tmp_pos_ = 0 ; munit_tmp_pos_ < munit_tmp_size_ ; munit_tmp_pos_++) { \
+        if (munit_tmp_a_[munit_tmp_pos_] != munit_tmp_b_[munit_tmp_pos_]) { \
+          munit_errorf("assertion failed: memory %s == %s, at offset %" MUNIT_SIZE_MODIFIER "u", \
+                       #a, #b, munit_tmp_pos_); \
+          break; \
+        } \
+      } \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#define munit_assert_memory_not_equal(size, a, b) \
+  do { \
+    const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \
+    const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \
+    const size_t munit_tmp_size_ = (size); \
+    if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_) == 0)) { \
+      munit_errorf("assertion failed: memory %s != %s (%zu bytes)", \
+                   #a, #b, munit_tmp_size_); \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#define munit_assert_ptr_equal(a, b) \
+  munit_assert_ptr(a, ==, b)
+#define munit_assert_ptr_not_equal(a, b) \
+  munit_assert_ptr(a, !=, b)
+#define munit_assert_null(ptr) \
+  munit_assert_ptr(ptr, ==, NULL)
+#define munit_assert_not_null(ptr) \
+  munit_assert_ptr(ptr, !=, NULL)
+#define munit_assert_ptr_null(ptr) \
+  munit_assert_ptr(ptr, ==, NULL)
+#define munit_assert_ptr_not_null(ptr) \
+  munit_assert_ptr(ptr, !=, NULL)
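+
+/* Illustrative example (not part of the header): on failure these macros
+ * print both operands and abort the test through munit_errorf().  Given a
+ * hypothetical helper open_db() that returns -1,
+ *
+ *   int rv = open_db();
+ *   munit_assert_int(rv, ==, 0);
+ *
+ * reports "assertion failed: rv == 0 (-1 == 0)" and marks the test failed. */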
+
+/*** Memory allocation ***/
+
+void* munit_malloc_ex(const char* filename, int line, size_t size);
+
+#define munit_malloc(size) \
+  munit_malloc_ex(__FILE__, __LINE__, (size))
+
+#define munit_new(type) \
+  ((type*) munit_malloc(sizeof(type)))
+
+#define munit_calloc(nmemb, size) \
+  munit_malloc((nmemb) * (size))
+
+#define munit_newa(type, nmemb) \
+  ((type*) munit_calloc((nmemb), sizeof(type)))
+
+/*** Random number generation ***/
+
+void munit_rand_seed(munit_uint32_t seed);
+munit_uint32_t munit_rand_uint32(void);
+int munit_rand_int_range(int min, int max);
+double munit_rand_double(void);
+void munit_rand_memory(size_t size, munit_uint8_t buffer[MUNIT_ARRAY_PARAM(size)]);
+
+/*** Tests and Suites ***/
+
+typedef enum {
+  /* Test successful */
+  MUNIT_OK,
+  /* Test failed */
+  MUNIT_FAIL,
+  /* Test was skipped */
+  MUNIT_SKIP,
+  /* Test failed due to circumstances not intended to be tested
+   * (things like network errors, invalid parameter value, failure to
+   * allocate memory in the test harness, etc.). */
+  MUNIT_ERROR
+} MunitResult;
+
+typedef struct {
+  char* name;
+  char** values;
+} MunitParameterEnum;
+
+typedef struct {
+  char* name;
+  char* value;
+} MunitParameter;
+
+const char* munit_parameters_get(const MunitParameter params[], const char* key);
+
+typedef enum {
+  MUNIT_TEST_OPTION_NONE = 0,
+  MUNIT_TEST_OPTION_SINGLE_ITERATION = 1 << 0,
+  MUNIT_TEST_OPTION_TODO = 1 << 1
+} MunitTestOptions;
+
+typedef MunitResult (* MunitTestFunc)(const MunitParameter params[], void* user_data_or_fixture);
+typedef void* (* MunitTestSetup)(const MunitParameter params[], void* user_data);
+typedef void (* MunitTestTearDown)(void* fixture);
+
+typedef struct {
+  char* name;
+  MunitTestFunc test;
+  MunitTestSetup setup;
+  MunitTestTearDown tear_down;
+  MunitTestOptions options;
+  MunitParameterEnum* parameters;
+} MunitTest;
+
+typedef enum {
+  MUNIT_SUITE_OPTION_NONE = 0
+} MunitSuiteOptions;
+
+typedef struct MunitSuite_ MunitSuite;
+
+struct MunitSuite_ {
+  char* prefix;
+  MunitTest* tests;
+  MunitSuite* suites;
+  unsigned int iterations;
+  MunitSuiteOptions options;
+};
+
+int munit_suite_main(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]);
+
+/* Note: I'm not very happy with this API; it's likely to change if I
+ * figure out something better.  Suggestions welcome. */
+ */
+
+typedef struct MunitArgument_ MunitArgument;
+
+struct MunitArgument_ {
+  char* name;
+  bool (* parse_argument)(const MunitSuite* suite, void* user_data, int* arg, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]);
+  void (* write_help)(const MunitArgument* argument, void* user_data);
+};
+
+int munit_suite_main_custom(const MunitSuite* suite,
+                            void* user_data,
+                            int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)],
+                            const MunitArgument arguments[]);
+
+#if defined(MUNIT_ENABLE_ASSERT_ALIASES)
+
+#define assert_true(expr) munit_assert_true(expr)
+#define assert_false(expr) munit_assert_false(expr)
+#define assert_char(a, op, b) munit_assert_char(a, op, b)
+#define assert_uchar(a, op, b) munit_assert_uchar(a, op, b)
+#define assert_short(a, op, b) munit_assert_short(a, op, b)
+#define assert_ushort(a, op, b) munit_assert_ushort(a, op, b)
+#define assert_int(a, op, b) munit_assert_int(a, op, b)
+#define assert_uint(a, op, b) munit_assert_uint(a, op, b)
+#define assert_long(a, op, b) munit_assert_long(a, op, b)
+#define assert_ulong(a, op, b) munit_assert_ulong(a, op, b)
+#define assert_llong(a, op, b) munit_assert_llong(a, op, b)
+#define assert_ullong(a, op, b) munit_assert_ullong(a, op, b)
+#define assert_size(a, op, b) munit_assert_size(a, op, b)
+#define assert_float(a, op, b) munit_assert_float(a, op, b)
+#define assert_double(a, op, b) munit_assert_double(a, op, b)
+#define assert_ptr(a, op, b) munit_assert_ptr(a, op, b)
+
+#define assert_int8(a, op, b) munit_assert_int8(a, op, b)
+#define assert_uint8(a, op, b) munit_assert_uint8(a, op, b)
+#define assert_int16(a, op, b) munit_assert_int16(a, op, b)
+#define assert_uint16(a, op, b) munit_assert_uint16(a, op, b)
+#define assert_int32(a, op, b) munit_assert_int32(a, op, b)
+#define assert_uint32(a, op, b) munit_assert_uint32(a, op, b)
+#define assert_int64(a, op, b) munit_assert_int64(a, op, b)
+#define assert_uint64(a, op, b) munit_assert_uint64(a, op, b)
+
+#define assert_double_equal(a, b, precision) munit_assert_double_equal(a, b, precision)
+#define assert_string_equal(a, b) munit_assert_string_equal(a, b)
+#define assert_string_not_equal(a, b) munit_assert_string_not_equal(a, b)
+#define assert_memory_equal(size, a, b) munit_assert_memory_equal(size, a, b)
+#define assert_memory_not_equal(size, a, b) munit_assert_memory_not_equal(size, a, b)
+#define assert_ptr_equal(a, b) munit_assert_ptr_equal(a, b)
+#define assert_ptr_not_equal(a, b) munit_assert_ptr_not_equal(a, b)
+#define assert_ptr_null(ptr) munit_assert_ptr_null(ptr)
+#define assert_ptr_not_null(ptr) munit_assert_ptr_not_null(ptr)
+
+#define assert_null(ptr) munit_assert_null(ptr)
+#define assert_not_null(ptr) munit_assert_not_null(ptr)
+
+#endif /* defined(MUNIT_ENABLE_ASSERT_ALIASES) */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !defined(MUNIT_H) */
+
+#if defined(MUNIT_ENABLE_ASSERT_ALIASES)
+# if defined(assert)
+#  undef assert
+# endif
+# define assert(expr) munit_assert(expr)
+#endif
diff --git a/test/raft/lib/runner.h b/test/raft/lib/runner.h
new file mode 100644
index 000000000..13244a33a
--- /dev/null
+++ b/test/raft/lib/runner.h
@@ -0,0 +1,113 @@
+/* Convenience macros to reduce munit boilerplate. */
+
+#ifndef TEST_RUNNER_H_
+#define TEST_RUNNER_H_
+
+#include "munit.h"
+
+/* Top-level suites array declaration.
+ *
+ * These top-level suites hold all module-level child suites and must be
+ * defined and then set as child suites of a root suite created at runtime by
+ * the test runner's main(). This can be done using the RUNNER macro.
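+ *
+ * A minimal sketch of the intended usage (RUNNER goes in a main_*.c file,
+ * while SUITE and TEST go in the individual test files; the names here are
+ * borrowed from the byte unit tests further down):
+ *
+ *   RUNNER("core")
+ *
+ *   SUITE(byteCrc32)
+ *
+ *   TEST(byteCrc32, valid, NULL, NULL, 0, NULL)
+ *   {
+ *       return MUNIT_OK;
+ *   }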
+ */
+extern MunitSuite _main_suites[];
+extern int _main_suites_n;
+
+/* Maximum number of test cases for each suite */
+#define SUITE__CAP 128
+
+/* Define the top-level suites array and the main() function of the test. */
+#define RUNNER(NAME)                                                     \
+	MunitSuite _main_suites[SUITE__CAP];                             \
+	int _main_suites_n = 0;                                          \
+                                                                         \
+	int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)])          \
+	{                                                                \
+		MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0}; \
+		return munit_suite_main(&suite, (void *)NAME, argc, argv); \
+	}
+
+/* Declare and register a new test suite #S.
+ *
+ * A test suite is a pair of static variables:
+ *
+ *  static MunitSuite _##S##_suites[SUITE__CAP]
+ *  static MunitTest _##S##_tests[SUITE__CAP]
+ *
+ * The tests and suites attributes of the next available MunitSuite slot in
+ * the _main_suites array will be set to the suite's tests and suites arrays,
+ * and the prefix attribute of the slot will be set to /S. */
+#define SUITE(S)          \
+	SUITE__DECLARE(S) \
+	SUITE__ADD_CHILD(main, #S, S)
+
+/* Declare and register a new test. */
+#define TEST(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS)                     \
+	static MunitResult test_##S##_##C(const MunitParameter params[], \
+					  void *data);                    \
+	TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS)       \
+	static MunitResult test_##S##_##C(                                \
+		MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *data)
+
+#define SKIP_IF_NO_FIXTURE         \
+	if (f == NULL) {           \
+		return MUNIT_SKIP; \
+	}
+
+/* Declare the MunitSuite[] and the MunitTest[] arrays that compose the test
+ * suite identified by S. */
+#define SUITE__DECLARE(S)                                                \
+	static MunitSuite _##S##_suites[SUITE__CAP];                     \
+	static MunitTest _##S##_tests[SUITE__CAP];                       \
+	static MunitTestSetup _##S##_setup = NULL;                       \
+	static MunitTestTearDown _##S##_tear_down = NULL;                \
+	static int _##S##_suites_n = 0;                                  \
+	static int _##S##_tests_n = 0;                                   \
+	__attribute__((constructor(101))) static void _##S##_init(void)  \
+	{                                                                \
+		memset(_##S##_suites, 0, sizeof(_##S##_suites));         \
+		memset(_##S##_tests, 0, sizeof(_##S##_tests));           \
+		(void)_##S##_suites_n;                                   \
+		(void)_##S##_tests_n;                                    \
+		(void)_##S##_setup;                                      \
+		(void)_##S##_tear_down;                                  \
+	}
+
+/* Set the tests and suites attributes of the next available slot of the
+ * MunitSuite[] array of S1 to the MunitTest[] and MunitSuite[] arrays of S2,
+ * using the given PREFIX. */
+#define SUITE__ADD_CHILD(S1, PREFIX, S2)                                      \
+	__attribute__((constructor(102))) static void _##S1##_##S2##_init(void) \
+	{                                                                     \
+		int n = _##S1##_suites_n;                                     \
+		_##S1##_suites[n].prefix = PREFIX;                            \
+		_##S1##_suites[n].tests = _##S2##_tests;                      \
+		_##S1##_suites[n].suites = _##S2##_suites;                    \
+		_##S1##_suites[n].iterations = 0;                             \
+		_##S1##_suites[n].options = 0;                                \
+		_##S1##_suites_n = n + 1;                                     \
+	}
+
+/* Add a test case to the MunitTest[] array of suite S.
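+ *
+ * For instance, TEST(byteCrc32, valid, ...) in test_byte.c below expands to
+ * a static function test_byteCrc32_valid plus a constructor that registers
+ * it under the name "/valid" in _byteCrc32_tests.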
+ */
+#define TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS)       \
+	__attribute__((constructor(103))) static void _##S##_tests_##C##_init( \
+		void)                                                      \
+	{                                                                  \
+		MunitTest *tests = _##S##_tests;                           \
+		int n = _##S##_tests_n;                                    \
+		TEST__SET_IN_ARRAY(tests, n, "/" #C, test_##S##_##C, SETUP, TEAR_DOWN, \
+				   OPTIONS, PARAMS);                       \
+		_##S##_tests_n = n + 1;                                    \
+	}
+
+/* Set the values of the I'th test case slot in the given test array */
+#define TEST__SET_IN_ARRAY(TESTS, I, NAME, FUNC, SETUP, TEAR_DOWN, OPTIONS, \
+			   PARAMS)                                          \
+	TESTS[I].name = NAME;                                               \
+	TESTS[I].test = FUNC;                                               \
+	TESTS[I].setup = SETUP;                                             \
+	TESTS[I].tear_down = TEAR_DOWN;                                     \
+	TESTS[I].options = OPTIONS;                                         \
+	TESTS[I].parameters = PARAMS
+
+#endif /* TEST_RUNNER_H_ */
diff --git a/test/raft/lib/snapshot.h b/test/raft/lib/snapshot.h
new file mode 100644
index 000000000..4a6e8af10
--- /dev/null
+++ b/test/raft/lib/snapshot.h
@@ -0,0 +1,26 @@
+/**
+ * Raft snapshot test helpers.
+ */
+
+#ifndef TEST_SNAPSHOT_H
+#define TEST_SNAPSHOT_H
+
+#include "../../../src/raft.h"
+
+#include "../../../src/raft/configuration.h"
+
+/**
+ * Allocate and create the given snapshot, using the given @LAST_INDEX,
+ * @LAST_TERM, the given @CONF, and generating an FSM snapshot using @X and @Y.
+ */
+#define CREATE_SNAPSHOT(SNAPSHOT, LAST_INDEX, LAST_TERM, CONF, CONF_INDEX, X, \
+			Y)                                                    \
+	SNAPSHOT = raft_malloc(sizeof *SNAPSHOT);                             \
+	munit_assert_ptr_not_null(SNAPSHOT);                                  \
+	SNAPSHOT->index = LAST_INDEX;                                         \
+	SNAPSHOT->term = LAST_TERM;                                           \
+	SNAPSHOT->configuration = CONF;                                       \
+	SNAPSHOT->configuration_index = CONF_INDEX;                           \
+	FsmEncodeSnapshot(X, Y, &SNAPSHOT->bufs, &SNAPSHOT->n_bufs)
+
+#endif /* TEST_SNAPSHOT_H */
diff --git a/test/raft/lib/tcp.c b/test/raft/lib/tcp.c
new file mode 100644
index 000000000..02b305739
--- /dev/null
+++ b/test/raft/lib/tcp.c
@@ -0,0 +1,236 @@
+#include "tcp.h"
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+void TcpServerInit(struct TcpServer *s)
+{
+	struct sockaddr_in addr;
+	socklen_t size = sizeof addr;
+	int rv;
+
+	/* Initialize the socket address structure. */
+	memset(&addr, 0, size);
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+	addr.sin_port = 0; /* Get a random free port */
+
+	/* Create the server socket. */
+	s->socket = socket(AF_INET, SOCK_STREAM, 0);
+	if (s->socket == -1) {
+		munit_errorf("tcp server: socket(): %s", strerror(errno));
+	}
+
+	/* Bind the socket. */
+	rv = bind(s->socket, (struct sockaddr *)&addr, size);
+	if (rv == -1) {
+		munit_errorf("tcp server: bind(): %s", strerror(errno));
+	}
+
+	/* Start listening. */
+	rv = listen(s->socket, 1);
+	if (rv == -1) {
+		munit_errorf("tcp server: listen(): %s", strerror(errno));
+	}
+
+	/* Get the actual address assigned by the kernel and save it back in
+	 * the relevant field.
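+	 * Binding to port 0 above asked the kernel to pick a free ephemeral
+	 * port, and getsockname() is the way to find out which one was
+	 * chosen.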
+	 */
+	rv = getsockname(s->socket, (struct sockaddr *)&addr, &size);
+	if (rv != 0) {
+		munit_errorf("tcp: getsockname(): %s", strerror(errno));
+	}
+
+	s->port = ntohs(addr.sin_port);
+	sprintf(s->address, "127.0.0.1:%d", s->port);
+}
+
+void TcpServerClose(struct TcpServer *s)
+{
+	int rv;
+
+	if (s->socket == -1) {
+		return;
+	}
+
+	rv = close(s->socket);
+	if (rv == -1) {
+		munit_errorf("tcp server: close(): %s", strerror(errno));
+	}
+}
+
+int TcpServerAccept(struct TcpServer *s)
+{
+	int socket;
+	struct sockaddr_in address;
+	socklen_t size;
+
+	size = sizeof(address);
+
+	socket = accept(s->socket, (struct sockaddr *)&address, &size);
+	if (socket < 0) {
+		munit_errorf("tcp server: accept(): %s", strerror(errno));
+	}
+
+	return socket;
+}
+
+void TcpServerStop(struct TcpServer *s)
+{
+	int rv;
+
+	rv = close(s->socket);
+	if (rv == -1) {
+		munit_errorf("tcp server: close(): %s", strerror(errno));
+	}
+	s->socket = -1;
+}
+
+void test_tcp_setup(const MunitParameter params[], struct test_tcp *t)
+{
+	(void)params;
+	t->server.socket = -1;
+	t->client.socket = -1;
+}
+
+void test_tcp_tear_down(struct test_tcp *t)
+{
+	int rv;
+
+	if (t->server.socket != -1) {
+		rv = close(t->server.socket);
+		if (rv == -1) {
+			munit_errorf("tcp: close(): %s", strerror(errno));
+		}
+	}
+
+	if (t->client.socket != -1) {
+		rv = close(t->client.socket);
+		if (rv == -1) {
+			munit_errorf("tcp: close(): %s", strerror(errno));
+		}
+	}
+}
+
+void test_tcp_listen(struct test_tcp *t)
+{
+	struct sockaddr_in addr;
+	socklen_t size = sizeof addr;
+	int rv;
+
+	/* Initialize the socket address structure. */
+	memset(&addr, 0, size);
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+	addr.sin_port = 0; /* Get a random free port */
+
+	/* Create the server socket. */
+	t->server.socket = socket(AF_INET, SOCK_STREAM, 0);
+	if (t->server.socket == -1) {
+		munit_errorf("tcp: socket(): %s", strerror(errno));
+	}
+
+	/* Bind the socket. */
+	rv = bind(t->server.socket, (struct sockaddr *)&addr, size);
+	if (rv == -1) {
+		munit_errorf("tcp: bind(): %s", strerror(errno));
+	}
+
+	/* Start listening. */
+	rv = listen(t->server.socket, 1);
+	if (rv == -1) {
+		munit_errorf("tcp: listen(): %s", strerror(errno));
+	}
+
+	/* Get the actual address assigned by the kernel and save it back in
+	 * the server field of the given test_tcp object. */
+	rv = getsockname(t->server.socket, (struct sockaddr *)&addr, &size);
+	if (rv != 0) {
+		munit_errorf("tcp: getsockname(): %s", strerror(errno));
+	}
+
+	sprintf(t->server.address, "127.0.0.1:%d", ntohs(addr.sin_port));
+}
+
+const char *test_tcp_address(struct test_tcp *t)
+{
+	return t->server.address;
+}
+
+void test_tcp_connect(struct test_tcp *t, int port)
+{
+	struct sockaddr_in addr;
+	int rv;
+
+	/* Create the client socket. */
+	t->client.socket = socket(AF_INET, SOCK_STREAM, 0);
+	if (t->client.socket == -1) {
+		munit_errorf("tcp: socket(): %s", strerror(errno));
+	}
+
+	/* Initialize the socket address structure.
*/ + memset(&addr, 0, sizeof addr); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr.sin_port = htons(port); + + /* Connect */ + rv = connect(t->client.socket, (struct sockaddr *)&addr, sizeof addr); + if (rv == -1) { + munit_errorf("tcp: connect(): %s", strerror(errno)); + } +} + +void test_tcp_close(struct test_tcp *t) +{ + int rv; + + rv = close(t->client.socket); + if (rv == -1) { + munit_errorf("tcp: close(): %s", strerror(errno)); + } + t->client.socket = -1; +} + +void test_tcp_stop(struct test_tcp *t) +{ + int rv; + + rv = close(t->server.socket); + if (rv == -1) { + munit_errorf("tcp: close(): %s", strerror(errno)); + } + t->server.socket = -1; +} + +void test_tcp_send(struct test_tcp *t, const void *buf, int len) +{ + int rv; + + rv = write(t->client.socket, buf, len); + if (rv == -1) { + munit_errorf("tcp: write(): %s", strerror(errno)); + } + if (rv != len) { + munit_errorf("tcp: write(): only %d bytes written", rv); + } +} + +int test_tcp_accept(struct test_tcp *t) +{ + int socket; + struct sockaddr_in address; + socklen_t size; + + size = sizeof(address); + + socket = accept(t->server.socket, (struct sockaddr *)&address, &size); + if (socket < 0) { + munit_errorf("tcp: accept(): %s", strerror(errno)); + } + + return socket; +} diff --git a/test/raft/lib/tcp.h b/test/raft/lib/tcp.h new file mode 100644 index 000000000..c84b2241d --- /dev/null +++ b/test/raft/lib/tcp.h @@ -0,0 +1,110 @@ +/* Test TCP utilities. + * + * This module sports helpers to create server or client sockets, and + * send/receive data through them. + */ + +#ifndef TEST_TCP_H +#define TEST_TCP_H + +#include "munit.h" + +/* Macro helpers. */ +#define FIXTURE_TCP_SERVER struct TcpServer server +#define SETUP_TCP_SERVER TcpServerInit(&f->server) +#define TEAR_DOWN_TCP_SERVER TcpServerClose(&f->server) + +#define TCP_SERVER_STOP TcpServerStop(&f->server) +#define TCP_SERVER_PORT f->server.port +#define TCP_SERVER_ADDRESS f->server.address + +#define FIXTURE_TCP struct test_tcp tcp +#define SETUP_TCP test_tcp_setup(params, &f->tcp) +#define TEAR_DOWN_TCP test_tcp_tear_down(&f->tcp) + +#define TCP_CLIENT_CONNECT(PORT) test_tcp_connect(&f->tcp, PORT) +#define TCP_CLIENT_SEND(BUF, N) test_tcp_send(&f->tcp, BUF, N) +#define TCP_CLIENT_CLOSE test_tcp_close(&f->tcp) + +struct TcpServer +{ + int socket; /* Socket listening to incoming connections */ + int port; + char address[128]; /* IPv4 address of the server, with port */ +}; + +void TcpServerInit(struct TcpServer *s); +void TcpServerClose(struct TcpServer *s); + +/* Accept inbound client connection and return the relevant socket. */ +int TcpServerAccept(struct TcpServer *s); + +/* Close the server socket. */ +void TcpServerStop(struct TcpServer *s); + +struct TcpClient +{ + int socket; /* Socket connected to a server. */ +}; + +void TcpClientInit(struct TcpClient *s); +void TcpClientClose(struct TcpClient *s); + +/* Object that can be used to setup and control a TCP server and/or client. */ +struct test_tcp +{ + struct + { + int socket; /* Socket listening to incoming connections */ + char address[128]; /* IPv4 address of the server, with port */ + } server; + struct + { + int socket; /* Socket connected to another host */ + } client; +}; + +/** + * Bind the server socket of the given test TCP host to localhost and start + * listening to it. + */ +void test_tcp_setup(const MunitParameter params[], struct test_tcp *t); + +void test_tcp_tear_down(struct test_tcp *t); + +/** + * Start listening to a random free port on localhost. 
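+ * The chosen address can then be retrieved with test_tcp_address().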
+ */
+void test_tcp_listen(struct test_tcp *t);
+
+/**
+ * Return the address of the server socket created with @test_tcp_listen.
+ */
+const char *test_tcp_address(struct test_tcp *t);
+
+/**
+ * Connect the client socket to the given port on localhost.
+ */
+void test_tcp_connect(struct test_tcp *t, int port);
+
+/**
+ * Close the client socket.
+ */
+void test_tcp_close(struct test_tcp *t);
+
+/**
+ * Send data using the client socket.
+ */
+void test_tcp_send(struct test_tcp *t, const void *buf, int len);
+
+/**
+ * Accept inbound client connection and return the relevant socket.
+ */
+int test_tcp_accept(struct test_tcp *t);
+
+/**
+ * Close the server socket.
+ */
+void test_tcp_stop(struct test_tcp *t);
+
+#endif /* TEST_TCP_H */
diff --git a/test/raft/lib/uv.h b/test/raft/lib/uv.h
new file mode 100644
index 000000000..7fdcdd08b
--- /dev/null
+++ b/test/raft/lib/uv.h
@@ -0,0 +1,64 @@
+/* Helpers around the libuv-based implementation of the raft_io interface. */
+
+#ifndef TEST_UV_H
+#define TEST_UV_H
+
+#include "../../../src/raft.h"
+#include "dir.h"
+#include "heap.h"
+#include "loop.h"
+
+#define FIXTURE_UV_TRANSPORT struct raft_uv_transport transport
+#define SETUP_UV_TRANSPORT                                       \
+	do {                                                     \
+		int rv_;                                         \
+		f->transport.version = 1;                        \
+		rv_ = raft_uv_tcp_init(&f->transport, &f->loop); \
+		munit_assert_int(rv_, ==, 0);                    \
+	} while (0)
+#define TEAR_DOWN_UV_TRANSPORT raft_uv_tcp_close(&f->transport)
+
+#define FIXTURE_UV_DEPS \
+	FIXTURE_DIR;    \
+	FIXTURE_HEAP;   \
+	FIXTURE_LOOP;   \
+	FIXTURE_UV_TRANSPORT
+#define SETUP_UV_DEPS \
+	SET_UP_DIR;   \
+	SET_UP_HEAP;  \
+	SETUP_LOOP;   \
+	SETUP_UV_TRANSPORT
+#define TEAR_DOWN_UV_DEPS       \
+	TEAR_DOWN_UV_TRANSPORT; \
+	TEAR_DOWN_LOOP;         \
+	TEAR_DOWN_HEAP;         \
+	TEAR_DOWN_DIR
+
+#define FIXTURE_UV struct raft_io io
+
+#define SETUP_UV                                                              \
+	do {                                                                  \
+		int rv_;                                                      \
+		rv_ = raft_uv_init(&f->io, &f->loop, f->dir, &f->transport);  \
+		munit_assert_int(rv_, ==, 0);                                 \
+		raft_uv_set_auto_recovery(&f->io, false);                     \
+		rv_ = f->io.init(&f->io, 1, "127.0.0.1:9001");                \
+		munit_assert_int(rv_, ==, 0);                                 \
+	} while (0)
+
+MUNIT_UNUSED static void uvCloseCb(struct raft_io *io)
+{
+	bool *closed = io->data;
+	*closed = true;
+}
+
+#define TEAR_DOWN_UV                            \
+	do {                                    \
+		bool _closed = false;           \
+		f->io.data = &_closed;          \
+		f->io.close(&f->io, uvCloseCb); \
+		LOOP_RUN_UNTIL(&_closed);       \
+		raft_uv_close(&f->io);          \
+	} while (0)
+
+#endif /* TEST_UV_H */
diff --git a/test/raft/unit/main_core.c b/test/raft/unit/main_core.c
new file mode 100644
index 000000000..ad1798bba
--- /dev/null
+++ b/test/raft/unit/main_core.c
@@ -0,0 +1,3 @@
+#include "../lib/runner.h"
+
+RUNNER("core")
diff --git a/test/raft/unit/main_uv.c b/test/raft/unit/main_uv.c
new file mode 100644
index 000000000..7f2eba543
--- /dev/null
+++ b/test/raft/unit/main_uv.c
@@ -0,0 +1,3 @@
+#include "../lib/runner.h"
+
+RUNNER("uv")
diff --git a/test/raft/unit/test_byte.c b/test/raft/unit/test_byte.c
new file mode 100644
index 000000000..2ad2dd485
--- /dev/null
+++ b/test/raft/unit/test_byte.c
@@ -0,0 +1,179 @@
+#include <ctype.h>
+#include <stdio.h>
+
+#include "../../../src/raft/byte.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+#define CRC32(VALUE) byteCrc32(&(VALUE), sizeof VALUE, 0)
+
+/******************************************************************************
+ *
+ * byteCrc32
+ *
+ *****************************************************************************/
+
+SUITE(byteCrc32) + +/* The same data produces the same sum. */ +TEST(byteCrc32, valid, NULL, NULL, 0, NULL) +{ + uint64_t value1 = 123456789; + uint64_t value2 = 123456789; + munit_assert_int(CRC32(value1), ==, CRC32(value2)); + return MUNIT_OK; +} + +/* Different data produces a different sum. */ +TEST(byteCrc32, invalid, NULL, NULL, 0, NULL) +{ + uint64_t value1 = 123456789; + uint64_t value2 = 123466789; + munit_assert_int(CRC32(value1), !=, CRC32(value2)); + return MUNIT_OK; +} + +/****************************************************************************** + * + * Convert to little endian representation (least significant byte first). + * + *****************************************************************************/ + +SUITE(byteFlip) + +/* Convert a 32-bit number. */ +TEST(byteFlip, 32, NULL, NULL, 0, NULL) +{ + uint32_t value; + unsigned i; + value = byteFlip32(0x03020100); + for (i = 0; i < 4; i++) { + munit_assert_int(*((uint8_t *)&value + i), ==, i); + } + return MUNIT_OK; +} + +/* Convert a 64-bit number. */ +TEST(byteFlip, 64, NULL, NULL, 0, NULL) +{ + uint64_t value; + unsigned i; + value = byteFlip64(0x0706050403020100); + for (i = 0; i < 8; i++) { + munit_assert_int(*((uint8_t *)&value + i), ==, i); + } + return MUNIT_OK; +} + +/****************************************************************************** + * + * byteGetString + * + *****************************************************************************/ + +SUITE(byteGetString) + +TEST(byteGetString, success, NULL, NULL, 0, NULL) +{ + uint8_t buf[] = {'h', 'e', 'l', 'l', 'o', 0}; + const void *cursor = buf; + munit_assert_string_equal(byteGetString(&cursor, sizeof buf), "hello"); + munit_assert_ptr_equal(cursor, buf + sizeof buf); + return MUNIT_OK; +} + +TEST(byteGetString, malformed, NULL, NULL, 0, NULL) +{ + uint8_t buf[] = {'h', 'e', 'l', 'l', 'o', 'w'}; + const void *cursor = buf; + munit_assert_ptr_equal(byteGetString(&cursor, sizeof buf), NULL); + munit_assert_ptr_equal(cursor, buf); + return MUNIT_OK; +} + +/****************************************************************************** + * + * byteGet64 + * + *****************************************************************************/ + +SUITE(byteGet64) + +TEST(byteGet64, success, NULL, NULL, 0, NULL) +{ + uint8_t *buf = munit_malloc(sizeof(uint64_t) * 2); + void *cursor1 = buf + 1; + const void *cursor2 = buf + 1; + bytePut64(&cursor1, 1); + munit_assert_int(byteGet64(&cursor2), ==, 1); + free(buf); + return MUNIT_OK; +} + +/****************************************************************************** + * + * byteSha1 + * + *****************************************************************************/ + +/* Assert that the 20 bytes contained in VALUE match the given DIGEST + * hexadecimal representation. 
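+ * For instance, a leading digest byte 0xa9 is printed as "a9" by the
+ * sprintf() call and then upcased to "A9" before the comparison.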
+ */
+#define ASSERT_SHA1(VALUE, DIGEST)                                  \
+	do {                                                        \
+		char _digest[41];                                   \
+		unsigned _i;                                        \
+		for (_i = 0; _i < 20; _i++) {                       \
+			unsigned _j = _i * 2;                       \
+			sprintf(&_digest[_j], "%.2x", VALUE[_i]);   \
+			_digest[_j] = toupper(_digest[_j]);         \
+			_digest[_j + 1] = toupper(_digest[_j + 1]); \
+		}                                                   \
+		_digest[40] = '\0';                                 \
+		munit_assert_string_equal(_digest, DIGEST);         \
+	} while (0)
+
+SUITE(byteSha1)
+
+TEST(byteSha1, abc, NULL, NULL, 0, NULL)
+{
+	struct byteSha1 sha1;
+	uint8_t text[] = "abc";
+	uint8_t value[20];
+	byteSha1Init(&sha1);
+	byteSha1Update(&sha1, text, sizeof text - 1);
+	byteSha1Digest(&sha1, value);
+	ASSERT_SHA1(value, "A9993E364706816ABA3E25717850C26C9CD0D89D");
+	return MUNIT_OK;
+}
+
+TEST(byteSha1, abcWithZeroLen, NULL, NULL, 0, NULL)
+{
+	struct byteSha1 sha1;
+	uint8_t text[] = "abc";
+	uint8_t garbage[] = "garbage";
+	uint8_t value[20];
+	byteSha1Init(&sha1);
+	byteSha1Update(&sha1, text, sizeof text - 1);
+	/* Update with 0 length buffer doesn't change digest */
+	byteSha1Update(&sha1, garbage, 0);
+	byteSha1Digest(&sha1, value);
+	ASSERT_SHA1(value, "A9993E364706816ABA3E25717850C26C9CD0D89D");
+	return MUNIT_OK;
+}
+
+TEST(byteSha1, abcbd, NULL, NULL, 0, NULL)
+{
+	struct byteSha1 sha1;
+	uint8_t text[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+	uint8_t value[20];
+	byteSha1Init(&sha1);
+	byteSha1Update(&sha1, text, sizeof text - 1);
+	byteSha1Digest(&sha1, value);
+	ASSERT_SHA1(value, "84983E441C3BD26EBAAE4AA1F95129E5E54670F1");
+	return MUNIT_OK;
+}
diff --git a/test/raft/unit/test_compress.c b/test/raft/unit/test_compress.c
new file mode 100644
index 000000000..0b5a2126c
--- /dev/null
+++ b/test/raft/unit/test_compress.c
@@ -0,0 +1,319 @@
+#include "../../../src/raft/byte.h"
+#include "../../../src/raft/compress.h"
+#include "../lib/munit.h"
+#include "../lib/runner.h"
+
+#include <stdlib.h>
+#ifdef LZ4_AVAILABLE
+#include <lz4frame.h>
+#endif
+
+SUITE(Compress)
+
+struct raft_buffer getBufWithRandom(size_t len)
+{
+	struct raft_buffer buf = {0};
+	buf.len = len;
+	buf.base = munit_malloc(buf.len);
+	if (len != 0) {
+		munit_assert_ptr_not_null(buf.base);
+	}
+
+	size_t offset = 0;
+	/* Write as many random ints in buf as possible */
+	for (size_t n = buf.len / sizeof(int); n > 0; n--) {
+		*((int *)(buf.base) + offset) = rand();
+		offset += 1;
+	}
+
+	/* Fill the remaining bytes */
+	size_t rem = buf.len % sizeof(int);
+	/* Offset will now be used in char* arithmetic */
+	offset *= sizeof(int);
+	if (rem) {
+		int r_int = rand();
+		for (unsigned i = 0; i < rem; i++) {
+			*((char *)buf.base + offset) = *((char *)&r_int + i);
+			offset++;
+		}
+	}
+
+	munit_assert_ulong(offset, ==, buf.len);
+	return buf;
+}
+
+struct raft_buffer getBufWithNonRandom(size_t len)
+{
+	struct raft_buffer buf = {0};
+	buf.len = len;
+	buf.base = munit_malloc(buf.len);
+	if (len != 0) {
+		munit_assert_ptr_not_null(buf.base);
+	}
+
+	memset(buf.base, 0xAC, buf.len);
+	return buf;
+}
+
+#ifdef LZ4_AVAILABLE
+
+static void sha1(struct raft_buffer bufs[], unsigned n_bufs, uint8_t value[20])
+{
+	struct byteSha1 sha;
+	byteSha1Init(&sha);
+	for (unsigned i = 0; i < n_bufs; i++) {
+		byteSha1Update(&sha, (const uint8_t *)bufs[i].base,
+			       (uint32_t)bufs[i].len);
+	}
+	byteSha1Digest(&sha, value);
+}
+
+TEST(Compress, compressDecompressZeroLength, NULL, NULL, 0, NULL)
+{
+	char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0};
+	struct raft_buffer bufs1[2] = {{NULL, 0},
+				       {(void *)0xDEADBEEF, 0}}; /* 0 length */
+	struct raft_buffer bufs2[2] = {{(void *)0xDEADBEEF, 0},
+				       {NULL, 0}}; /* 0 length */
+	struct raft_buffer
compressed = {0}; + munit_assert_int(Compress(&bufs1[0], 1, &compressed, errmsg), ==, + RAFT_INVALID); + munit_assert_int(Compress(&bufs1[1], 1, &compressed, errmsg), ==, + RAFT_INVALID); + munit_assert_int(Compress(bufs1, 2, &compressed, errmsg), ==, RAFT_INVALID); + munit_assert_int(Compress(bufs2, 2, &compressed, errmsg), ==, RAFT_INVALID); + return MUNIT_OK; +} + +static char *len_one_params[] = { + /* 16B 1KB 64KB 4MB 128MB */ + "16", "1024", "65536", "4194304", "134217728", + /* Around Blocksize*/ + "65516", "65517", "65518", "65521", "65535", "65537", "65551", "65555", + "65556", + /* Ugly lengths */ + "0", "1", "9", "123450", "1337", "6655111", NULL}; + +static MunitParameterEnum random_one_params[] = { + {"len_one", len_one_params}, + {NULL, NULL}, +}; + +TEST(Compress, compressDecompressRandomOne, NULL, NULL, 0, random_one_params) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + struct raft_buffer decompressed = {0}; + uint8_t sha1_virgin[20] = {0}; + uint8_t sha1_decompressed[20] = {1}; + + /* Fill a buffer with random data */ + size_t len = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); + if (len == 0) { + return MUNIT_SKIP; + } + struct raft_buffer buf = getBufWithRandom(len); + + /* Assert that after compression and decompression the data is unchanged */ + sha1(&buf, 1, sha1_virgin); + munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); + free(buf.base); + munit_assert_true(IsCompressed(compressed.base, compressed.len)); + munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); + munit_assert_ulong(decompressed.len, ==, len); + sha1(&decompressed, 1, sha1_decompressed); + munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); + + raft_free(compressed.base); + raft_free(decompressed.base); + return MUNIT_OK; +} + +static char *len_nonrandom_one_params[] = { +#if !defined(__LP64__) && \ + (defined(__arm__) || defined(__i386__) || defined(__mips__)) + /* 4KB 64KB 4MB 1GB INT_MAX (larger allocations + fail on 32-bit archs */ + "4096", "65536", "4194304", "1073741824", "2147483647", +#else + /* 4KB 64KB 4MB 1GB 2GB + 200MB */ + "4096", "65536", "4194304", "1073741824", "2357198848", +#endif + /* Around Blocksize*/ + "65516", "65517", "65518", "65521", "65535", "65537", "65551", "65555", + "65556", + /* Ugly lengths */ + "0", "993450", "31337", "83883825", NULL}; + +static MunitParameterEnum nonrandom_one_params[] = { + {"len_one", len_nonrandom_one_params}, + {NULL, NULL}, +}; + +TEST(Compress, + compressDecompressNonRandomOne, + NULL, + NULL, + 0, + nonrandom_one_params) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + struct raft_buffer decompressed = {0}; + uint8_t sha1_virgin[20] = {0}; + uint8_t sha1_decompressed[20] = {1}; + + /* Fill a buffer with non-random data */ + size_t len = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); + if (len == 0) { + return MUNIT_SKIP; + } + struct raft_buffer buf = getBufWithNonRandom(len); + + /* Assert that after compression and decompression the data is unchanged and + * that the compressed data is actually smaller */ + sha1(&buf, 1, sha1_virgin); + munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); + free(buf.base); + munit_assert_true(IsCompressed(compressed.base, compressed.len)); + if (len > 0) { + munit_assert_ulong(compressed.len, <, buf.len); + } + munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); + munit_assert_ulong(decompressed.len, ==, len); + 
sha1(&decompressed, 1, sha1_decompressed); + munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); + + raft_free(compressed.base); + raft_free(decompressed.base); + return MUNIT_OK; +} + +static char *len_two_params[] = {"4194304", "13373", "66", "0", NULL}; + +static MunitParameterEnum random_two_params[] = { + {"len_one", len_one_params}, + {"len_two", len_two_params}, + {NULL, NULL}, +}; + +TEST(Compress, compressDecompressRandomTwo, NULL, NULL, 0, random_two_params) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + struct raft_buffer decompressed = {0}; + uint8_t sha1_virgin[20] = {0}; + uint8_t sha1_single[20] = {0}; + uint8_t sha1_decompressed[20] = {1}; + + /* Fill two buffers with random data */ + size_t len1 = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); + size_t len2 = strtoul(munit_parameters_get(params, "len_two"), NULL, 0); + if (len1 + len2 == 0) { + return MUNIT_SKIP; + } + struct raft_buffer buf1 = getBufWithRandom(len1); + struct raft_buffer buf2 = getBufWithRandom(len2); + struct raft_buffer bufs[2] = {buf1, buf2}; + + /* If one of the buffers is empty ensure data is identical to single buffer + * case. */ + if (len1 == 0) { + sha1(&buf2, 1, sha1_single); + } else if (len2 == 0) { + sha1(&buf1, 1, sha1_single); + } + + /* Assert that after compression and decompression the data is unchanged */ + sha1(bufs, 2, sha1_virgin); + munit_assert_int(Compress(bufs, 2, &compressed, errmsg), ==, 0); + free(buf1.base); + free(buf2.base); + munit_assert_true(IsCompressed(compressed.base, compressed.len)); + munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); + munit_assert_ulong(decompressed.len, ==, buf1.len + buf2.len); + sha1(&decompressed, 1, sha1_decompressed); + munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); + + if (len1 == 0 || len2 == 0) { + munit_assert_int(memcmp(sha1_single, sha1_virgin, 20), ==, 0); + munit_assert_int(memcmp(sha1_single, sha1_decompressed, 20), ==, 0); + } + + raft_free(compressed.base); + raft_free(decompressed.base); + return MUNIT_OK; +} + +TEST(Compress, compressDecompressCorruption, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + struct raft_buffer decompressed = {0}; + + /* Fill a buffer with random data */ + size_t len = 2048; + struct raft_buffer buf = getBufWithRandom(len); + + munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); + munit_assert_true(IsCompressed(compressed.base, compressed.len)); + + /* Corrupt the a data byte after the header */ + munit_assert_ulong(LZ4F_HEADER_SIZE_MAX_RAFT, <, compressed.len); + ((char *)compressed.base)[LZ4F_HEADER_SIZE_MAX_RAFT] += 1; + + munit_assert_int(Decompress(compressed, &decompressed, errmsg), !=, 0); + munit_assert_string_equal(errmsg, + "LZ4F_decompress ERROR_contentChecksum_invalid"); + munit_assert_ptr_null(decompressed.base); + + raft_free(compressed.base); + free(buf.base); + return MUNIT_OK; +} + +#else + +TEST(Compress, lz4Disabled, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + + /* Fill a buffer with random data */ + size_t len = 2048; + struct raft_buffer buf = getBufWithRandom(len); + + munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, RAFT_INVALID); + munit_assert_ptr_null(compressed.base); + + free(buf.base); + return MUNIT_OK; +} + +#endif /* LZ4_AVAILABLE */ + +static const char LZ4_MAGIC[4] = {0x04, 0x22, 0x4d, 0x18}; +TEST(Compress, 
isCompressedTooSmall, NULL, NULL, 0, NULL) +{ + munit_assert_false(IsCompressed(&LZ4_MAGIC[1], sizeof(LZ4_MAGIC) - 1)); + return MUNIT_OK; +} + +TEST(Compress, isCompressedNull, NULL, NULL, 0, NULL) +{ + munit_assert_false(IsCompressed(NULL, sizeof(LZ4_MAGIC))); + return MUNIT_OK; +} + +TEST(Compress, isCompressed, NULL, NULL, 0, NULL) +{ + munit_assert_true(IsCompressed(LZ4_MAGIC, sizeof(LZ4_MAGIC))); + return MUNIT_OK; +} + +TEST(Compress, notCompressed, NULL, NULL, 0, NULL) +{ + char not_compressed[4] = {0x18, 0x4d, 0x22, 0x04}; + munit_assert_false(IsCompressed(not_compressed, sizeof(not_compressed))); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_configuration.c b/test/raft/unit/test_configuration.c new file mode 100644 index 000000000..91f6d9792 --- /dev/null +++ b/test/raft/unit/test_configuration.c @@ -0,0 +1,638 @@ +#include "../../../src/raft/byte.h" +#include "../../../src/raft/configuration.h" +#include "../lib/heap.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + struct raft_configuration configuration; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SET_UP_HEAP; + configurationInit(&f->configuration); + return f; +} + +static void tearDownNoClose(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_HEAP; + free(f); +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + configurationClose(&f->configuration); + tearDownNoClose(data); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Accessors */ +#define VOTER_COUNT configurationVoterCount(&f->configuration) +#define INDEX_OF(ID) configurationIndexOf(&f->configuration, ID) +#define INDEX_OF_VOTER(ID) configurationIndexOfVoter(&f->configuration, ID) +#define GET(ID) configurationGet(&f->configuration, ID) + +/* Add a server to the fixture's configuration. */ +#define ADD_RV(ID, ADDRESS, ROLE) \ + configurationAdd(&f->configuration, ID, ADDRESS, ROLE) +#define ADD(...) munit_assert_int(ADD_RV(__VA_ARGS__), ==, 0) +#define ADD_ERROR(RV, ...) munit_assert_int(ADD_RV(__VA_ARGS__), ==, RV) + +/* Remove a server from the fixture's configuration */ +#define REMOVE_RV(ID) configurationRemove(&f->configuration, ID) +#define REMOVE(...) munit_assert_int(REMOVE_RV(__VA_ARGS__), ==, 0) +#define REMOVE_ERROR(RV, ...) munit_assert_int(REMOVE_RV(__VA_ARGS__), ==, RV) + +/* Copy the fixture's configuration into the given one. */ +#define COPY_RV(CONF) configurationCopy(&f->configuration, CONF) +#define COPY(...) munit_assert_int(COPY_RV(__VA_ARGS__), ==, 0) +#define COPY_ERROR(RV, ...) munit_assert_int(COPY_RV(__VA_ARGS__), ==, RV) + +/* Encode the fixture's configuration into the given buffer. */ +#define ENCODE_RV(BUF) configurationEncode(&f->configuration, BUF) +#define ENCODE(...) munit_assert_int(ENCODE_RV(__VA_ARGS__), ==, 0) +#define ENCODE_ERROR(RV, ...) munit_assert_int(ENCODE_RV(__VA_ARGS__), ==, RV) + +/* Decode the given buffer into the fixture's configuration. */ +#define DECODE_RV(BUF) configurationDecode(BUF, &f->configuration) +#define DECODE(...) munit_assert_int(DECODE_RV(__VA_ARGS__), ==, 0) +#define DECODE_ERROR(RV, ...) 
munit_assert_int(DECODE_RV(__VA_ARGS__), ==, RV) + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert that the fixture's configuration has n servers. */ +#define ASSERT_N(N) \ + { \ + munit_assert_int(f->configuration.n, ==, N); \ + if (N == 0) { \ + munit_assert_ptr_null(f->configuration.servers); \ + } else { \ + munit_assert_ptr_not_null(f->configuration.servers); \ + } \ + } + +/* Assert that the attributes of the I'th server in the fixture's configuration + * match the given values. */ +#define ASSERT_SERVER(I, ID, ADDRESS, ROLE) \ + { \ + struct raft_server *server; \ + munit_assert_int(I, <, f->configuration.n); \ + server = &f->configuration.servers[I]; \ + munit_assert_int(server->id, ==, ID); \ + munit_assert_string_equal(server->address, ADDRESS); \ + munit_assert_int(server->role, ==, ROLE); \ + } + +/****************************************************************************** + * + * configurationVoterCount + * + *****************************************************************************/ + +SUITE(configurationVoterCount) + +/* All servers are voting. */ +TEST(configurationVoterCount, all_voters, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + ADD(2, "192.168.1.2:666", RAFT_VOTER); + munit_assert_int(VOTER_COUNT, ==, 2); + return MUNIT_OK; +} + +/* Return only voting servers. */ +TEST(configurationVoterCount, filter, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + ADD(2, "192.168.1.2:666", RAFT_STANDBY); + munit_assert_int(VOTER_COUNT, ==, 1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationIndexOf + * + *****************************************************************************/ + +SUITE(configurationIndexOf) + +/* If a matching server is found, it's index is returned. */ +TEST(configurationIndexOf, match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + ADD(2, "192.168.1.2:666", RAFT_STANDBY); + munit_assert_int(INDEX_OF(2), ==, 1); + return MUNIT_OK; +} + +/* If no matching server is found, the length of the configuration is + * returned. */ +TEST(configurationIndexOf, no_match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + munit_assert_int(INDEX_OF(3), ==, f->configuration.n); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationIndexOfVoter + * + *****************************************************************************/ + +SUITE(configurationIndexOfVoter) + +/* The index of the matching voting server (relative to the number of voting + servers) is returned. */ +TEST(configurationIndexOfVoter, match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_STANDBY); + ADD(2, "192.168.1.2:666", RAFT_VOTER); + ADD(3, "192.168.1.3:666", RAFT_VOTER); + munit_assert_int(INDEX_OF_VOTER(3), ==, 1); + return MUNIT_OK; +} + +/* If no matching server is found, the length of the configuration is + * returned. 
*/ +TEST(configurationIndexOfVoter, no_match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + munit_assert_int(INDEX_OF_VOTER(3), ==, 1); + return MUNIT_OK; +} + +/* If the server exists but is non-voting, the length of the configuration is + * returned. */ +TEST(configurationIndexOfVoter, non_voting, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_STANDBY); + munit_assert_int(INDEX_OF_VOTER(1), ==, 1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationGet + * + *****************************************************************************/ + +SUITE(configurationGet) + +/* If a matching server is found, it's returned. */ +TEST(configurationGet, match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + const struct raft_server *server; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + ADD(2, "192.168.1.2:666", RAFT_STANDBY); + server = GET(2); + munit_assert_ptr_not_null(server); + munit_assert_int(server->id, ==, 2); + munit_assert_string_equal(server->address, "192.168.1.2:666"); + return MUNIT_OK; +} + +/* If no matching server is found, NULL is returned. */ +TEST(configurationGet, no_match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + munit_assert_ptr_null(GET(3)); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationCopy + * + *****************************************************************************/ + +SUITE(configurationCopy) + +/* Copy a configuration containing two servers */ +TEST(configurationCopy, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_configuration configuration; + ADD(1, "192.168.1.1:666", RAFT_STANDBY); + ADD(2, "192.168.1.2:666", RAFT_VOTER); + COPY(&configuration); + munit_assert_int(configuration.n, ==, 2); + munit_assert_int(configuration.servers[0].id, ==, 1); + munit_assert_int(configuration.servers[1].id, ==, 2); + configurationClose(&configuration); + return MUNIT_OK; +} + +static char *copy_oom_heap_fault_delay[] = {"0", "1", "2", NULL}; +static char *copy_oom_heap_fault_repeat[] = {"1", NULL}; + +static MunitParameterEnum copy_oom_params[] = { + {TEST_HEAP_FAULT_DELAY, copy_oom_heap_fault_delay}, + {TEST_HEAP_FAULT_REPEAT, copy_oom_heap_fault_repeat}, + {NULL, NULL}, +}; + +/* Out of memory */ +TEST(configurationCopy, oom, setUp, tearDown, 0, copy_oom_params) +{ + struct fixture *f = data; + struct raft_configuration configuration; + ADD(1, "192.168.1.1:666", RAFT_STANDBY); + ADD(2, "192.168.1.2:666", RAFT_VOTER); + HEAP_FAULT_ENABLE; + COPY_ERROR(RAFT_NOMEM, &configuration); + return MUNIT_OK; +} + +/****************************************************************************** + * + * raft_configuration_add + * + *****************************************************************************/ + +SUITE(configurationAdd) + +/* Add a server to the configuration. */ +TEST(configurationAdd, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ASSERT_N(1); + ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); + return MUNIT_OK; +} + +/* Add two servers to the configuration. 
*/ +TEST(configurationAdd, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD(2, "192.168.1.1:666", RAFT_STANDBY); + ASSERT_N(2); + ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); + ASSERT_SERVER(1, 2, "192.168.1.1:666", RAFT_STANDBY); + return MUNIT_OK; +} + +/* Add a server with an ID which is already in use. */ +TEST(configurationAdd, duplicateId, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD_ERROR(RAFT_DUPLICATEID, 1, "192.168.1.1:666", RAFT_STANDBY); + return MUNIT_OK; +} + +/* Add a server with an address which is already in use. */ +TEST(configurationAdd, duplicateAddress, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD_ERROR(RAFT_DUPLICATEADDRESS, 2, "127.0.0.1:666", RAFT_STANDBY); + return MUNIT_OK; +} + +/* Add a server with an invalid role. */ +TEST(configurationAdd, invalidRole, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD_ERROR(RAFT_BADROLE, 2, "127.0.0.1:666", 666); + return MUNIT_OK; +} + +static char *add_oom_heap_fault_delay[] = {"0", "1", NULL}; +static char *add_oom_heap_fault_repeat[] = {"1", NULL}; + +static MunitParameterEnum add_oom_params[] = { + {TEST_HEAP_FAULT_DELAY, add_oom_heap_fault_delay}, + {TEST_HEAP_FAULT_REPEAT, add_oom_heap_fault_repeat}, + {NULL, NULL}, +}; + +/* Out of memory. */ +TEST(configurationAdd, oom, setUp, tearDown, 0, add_oom_params) +{ + struct fixture *f = data; + HeapFaultEnable(&f->heap); + ADD_ERROR(RAFT_NOMEM, 1, "127.0.0.1:666", RAFT_VOTER); + munit_assert_null(f->configuration.servers); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationRemove + * + *****************************************************************************/ + +SUITE(configurationRemove) + +/* Remove the last and only server. */ +TEST(configurationRemove, last, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + REMOVE(1); + ASSERT_N(0); + return MUNIT_OK; +} + +/* Remove the first server. */ +TEST(configurationRemove, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD(2, "192.168.1.1:666", RAFT_STANDBY); + REMOVE(1); + ASSERT_N(1); + ASSERT_SERVER(0, 2, "192.168.1.1:666", RAFT_STANDBY); + return MUNIT_OK; +} + +/* Remove a server in the middle. */ +TEST(configurationRemove, middle, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD(2, "192.168.1.1:666", RAFT_STANDBY); + ADD(3, "10.0.1.1:666", RAFT_VOTER); + REMOVE(2); + ASSERT_N(2); + ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); + ASSERT_SERVER(1, 3, "10.0.1.1:666", RAFT_VOTER); + return MUNIT_OK; +} + +/* Attempts to remove a server with an unknown ID result in an error. */ +TEST(configurationRemove, unknownId, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + REMOVE_ERROR(RAFT_BADID, 1); + return MUNIT_OK; +} + +/* Out of memory. 
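+ *
+ * HeapFaultConfig(&f->heap, 0, 1) below arms the test allocator so that the
+ * very first allocation fails once, which configurationRemove is expected
+ * to surface as RAFT_NOMEM.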
*/ +TEST(configurationRemove, oom, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD(2, "192.168.1.1:666", RAFT_STANDBY); + HeapFaultConfig(&f->heap, 0, 1); + HeapFaultEnable(&f->heap); + REMOVE_ERROR(RAFT_NOMEM, 1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationEncode + * + *****************************************************************************/ + +SUITE(configurationEncode) + +/* Encode a configuration with one server. */ +TEST(configurationEncode, one_server, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_buffer buf; + size_t len; + const void *cursor; + const char *address = "127.0.0.1:666"; + ADD(1, address, RAFT_VOTER); + ENCODE(&buf); + + len = 1 + 8 + /* Version and n of servers */ + 8 + strlen(address) + 1; /* Server */ + len = bytePad64(len); + + munit_assert_int(buf.len, ==, len); + + cursor = buf.base; + + munit_assert_int(byteGet8(&cursor), ==, 1); + munit_assert_int(byteGet64(&cursor), ==, 1); + + munit_assert_int(byteGet64(&cursor), ==, 1); + munit_assert_string_equal(byteGetString(&cursor, strlen(address) + 1), + address); + munit_assert_int(byteGet8(&cursor), ==, RAFT_VOTER); + + raft_free(buf.base); + + return MUNIT_OK; +} + +/* Encode a configuration with two servers. */ +TEST(configurationEncode, two_servers, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_buffer buf; + size_t len; + const void *cursor; + const char *address1 = "127.0.0.1:666"; + const char *address2 = "192.168.1.1:666"; + + ADD(1, address1, RAFT_STANDBY); + ADD(2, address2, RAFT_VOTER); + ENCODE(&buf); + + len = 1 + 8 + /* Version and n of servers */ + 8 + strlen(address1) + 1 + 1 + /* Server 1 */ + 8 + strlen(address2) + 1 + 1; /* Server 2 */ + len = bytePad64(len); + + munit_assert_int(buf.len, ==, len); + + cursor = buf.base; + + munit_assert_int(byteGet8(&cursor), ==, 1); + munit_assert_int(byteGet64(&cursor), ==, 2); + + munit_assert_int(byteGet64(&cursor), ==, 1); + munit_assert_string_equal(byteGetString(&cursor, strlen(address1) + 1), + address1); + munit_assert_int(byteGet8(&cursor), ==, RAFT_STANDBY); + + munit_assert_int(byteGet64(&cursor), ==, 2); + munit_assert_string_equal(byteGetString(&cursor, strlen(address2) + 1), + address2); + munit_assert_int(byteGet8(&cursor), ==, RAFT_VOTER); + + raft_free(buf.base); + + return MUNIT_OK; +} + +/* Out of memory. 
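+ *
+ * The fault is armed with delay 2, so the allocations performed by ADD
+ * presumably succeed and the buffer allocation done by configurationEncode
+ * is the one that fails with RAFT_NOMEM.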
*/ +TEST(configurationEncode, oom, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_buffer buf; + HeapFaultConfig(&f->heap, 2, 1); + HeapFaultEnable(&f->heap); + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ENCODE_ERROR(RAFT_NOMEM, &buf); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationDecode + * + *****************************************************************************/ + +SUITE(configurationDecode) + +/* The decode a payload encoding a configuration with one server */ +TEST(configurationDecode, one_server, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 1, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', 0, /* Server address */ + 1}; /* Role code */ + struct raft_buffer buf; + + buf.base = bytes; + buf.len = sizeof bytes; + + DECODE(&buf); + + ASSERT_N(1); + ASSERT_SERVER(0, 5, "x.y", RAFT_VOTER); + + return MUNIT_OK; +} + +/* The decode size is the size of a raft_server array plus the length of the + * addresses. */ +TEST(configurationDecode, two_servers, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', 0, /* Server address */ + 1, /* Role code */ + 3, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + '1', '9', '2', '.', '2', 0, /* Server address */ + 0}; /* Role code */ + struct raft_buffer buf; + buf.base = bytes; + buf.len = sizeof bytes; + DECODE(&buf); + ASSERT_N(2); + ASSERT_SERVER(0, 5, "x.y", RAFT_VOTER); + ASSERT_SERVER(1, 3, "192.2", RAFT_STANDBY); + return MUNIT_OK; +} + +static char *decode_oom_heap_fault_delay[] = {"0", "1", "2", "3", NULL}; +static char *decode_oom_heap_fault_repeat[] = {"1", NULL}; + +static MunitParameterEnum decode_oom_params[] = { + {TEST_HEAP_FAULT_DELAY, decode_oom_heap_fault_delay}, + {TEST_HEAP_FAULT_REPEAT, decode_oom_heap_fault_repeat}, + {NULL, NULL}, +}; + +/* Not enough memory for creating the decoded configuration object. */ +TEST(configurationDecode, oom, setUp, tearDownNoClose, 0, decode_oom_params) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', 0, /* Server address */ + 1, /* Role code */ + 3, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'z', '.', 'w', 0, /* Server address */ + 0}; /* Role code */ + struct raft_buffer buf; + HEAP_FAULT_ENABLE; + buf.base = bytes; + buf.len = sizeof bytes; + DECODE_ERROR(RAFT_NOMEM, &buf); + return MUNIT_OK; +} + +/* If the encoding version is wrong, an error is returned. */ +TEST(configurationDecode, badVersion, setUp, tearDownNoClose, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes = 127; + struct raft_buffer buf; + buf.base = &bytes; + buf.len = 1; + DECODE_ERROR(RAFT_MALFORMED, &buf); + return MUNIT_OK; +} + +/* The address of a server is not a nul-terminated string. 
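+ * Decoding must fail with RAFT_MALFORMED rather than read past the end of
+ * the buffer.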
*/ +TEST(configurationDecode, badAddress, setUp, tearDownNoClose, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 1, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', /* Server address */ + 1}; /* Voting flag */ + struct raft_buffer buf; + buf.base = bytes; + buf.len = sizeof bytes; + DECODE_ERROR(RAFT_MALFORMED, &buf); + return MUNIT_OK; +} + +/* The encoded configuration is invalid because it has a duplicated server + * ID. In that case RAFT_MALFORMED is returned. */ +TEST(configurationDecode, duplicatedID, setUp, tearDownNoClose, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', 0, /* Server address */ + 1, /* Role code */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'z', '.', 'w', 0, /* Server address */ + 0}; /* Role code */ + struct raft_buffer buf; + buf.base = bytes; + buf.len = sizeof bytes; + DECODE_ERROR(RAFT_MALFORMED, &buf); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_err.c b/test/raft/unit/test_err.c new file mode 100644 index 000000000..95f88ea28 --- /dev/null +++ b/test/raft/unit/test_err.c @@ -0,0 +1,87 @@ +#include +#include + +#include "../../../src/raft/err.h" +#include "../lib/heap.h" +#include "../lib/runner.h" + +/* An error messages which is 249 characters. */ +#define LONG_ERRMSG \ + "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ + "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ + "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ + "boom boom boom boom boom boom boom boom" + +/****************************************************************************** + * + * ErrMsgPrintf + * + *****************************************************************************/ + +SUITE(ErrMsgPrintf) + +/* The format string has no parameters. */ +TEST(ErrMsgPrintf, noParams, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "boom"); + munit_assert_string_equal(errmsg, "boom"); + return MUNIT_OK; +} + +/* The format string has parameters. */ +TEST(ErrMsgPrintf, params, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "boom %d", 123); + munit_assert_string_equal(errmsg, "boom 123"); + return MUNIT_OK; +} + +/****************************************************************************** + * + * ErrMsgWrapf + * + *****************************************************************************/ + +SUITE(ErrMsgWrapf) + +/* The wrapping format string has no parameters. */ +TEST(ErrMsgWrapf, noParams, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "boom"); + ErrMsgWrapf(errmsg, "no luck"); + munit_assert_string_equal(errmsg, "no luck: boom"); + return MUNIT_OK; +} + +/* The wrapping format string has parameters. */ +TEST(ErrMsgWrapf, params, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "boom"); + ErrMsgWrapf(errmsg, "no luck, %s", "joe"); + munit_assert_string_equal(errmsg, "no luck, joe: boom"); + return MUNIT_OK; +} + +/* The wrapped error message gets partially truncated. 
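+ *
+ * With RAFT_ERRMSG_BUF_SIZE being 256 bytes, the 249-character prefix plus
+ * ": " occupies 251, leaving room for just four characters of "no luck"
+ * and the terminating nul, hence the "no l" suffix asserted below.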
*/ +TEST(ErrMsgWrapf, partialTruncate, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "no luck"); + ErrMsgWrapf(errmsg, LONG_ERRMSG); + munit_assert_string_equal(errmsg, LONG_ERRMSG ": no l"); + return MUNIT_OK; +} + +/* The wrapped error message gets entirely truncated. */ +TEST(ErrMsgWrapf, fullTruncate, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "no luck"); + ErrMsgWrapf(errmsg, LONG_ERRMSG " boom"); + munit_assert_string_equal(errmsg, LONG_ERRMSG " boom"); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_flags.c b/test/raft/unit/test_flags.c new file mode 100644 index 000000000..7fbbe26db --- /dev/null +++ b/test/raft/unit/test_flags.c @@ -0,0 +1,97 @@ +#include "../../../src/raft/flags.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * flags + * + *****************************************************************************/ + +SUITE(flags) + +TEST(flags, empty, NULL, NULL, 0, NULL) +{ + raft_flags flags = 0; + for (int i = 0; i < 64; i++) { + munit_assert_false(flagsIsSet(flags, ((raft_flags)1) << i)); + } + return MUNIT_OK; +} + +TEST(flags, setClear, NULL, NULL, 0, NULL) +{ + raft_flags flags = 0; + raft_flags flag = 0; + for (int i = 0; i < 64; i++) { + flag = ((raft_flags)1) << i; + flags = flagsSet(flags, flag); + munit_assert_true(flagsIsSet(flags, flag)); + flags = flagsClear(flags, flag); + munit_assert_false(flagsIsSet(flags, flag)); + munit_assert_true(flags == 0); + } + return MUNIT_OK; +} + +TEST(flags, setMultipleClearMultiple, NULL, NULL, 0, NULL) +{ + raft_flags in = 0; + raft_flags out; + raft_flags flags = (raft_flags)(1 | 1 << 4 | 1 << 13 | (raft_flags)1 << 40 | + (raft_flags)1 << 63); + out = flagsSet(in, flags); + /* clang-format off */ + int positions[64] = { + 1, 0, 0, 0, 1, 0, 0, 0, // 0th and 4th + 0, 0, 0, 0, 0, 1, 0, 0, // 13th + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, // 40th + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, // 63th + }; + /* clang-format on */ + for (unsigned i = 0; i < 64; i++) { + if (positions[i]) { + munit_assert_true(flagsIsSet(out, (raft_flags)1 << i)); + } else { + munit_assert_false(flagsIsSet(out, (raft_flags)1 << i)); + } + } + out = flagsClear(out, flags); + munit_assert_true(out == 0); + return MUNIT_OK; +} + +TEST(flags, setMultipleClearSingle, NULL, NULL, 0, NULL) +{ + raft_flags in = 0; + raft_flags out; + raft_flags flags = (raft_flags)(1 << 3 | 1 << 5 | 1 << 18 | + (raft_flags)1 << 32 | (raft_flags)1 << 35); + out = flagsSet(in, flags); + /* clang-format off */ + int positions[64] = { + 0, 0, 0, 1, 0, 1, 0, 0, // 3rd and 5th + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, // 18th + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 1, 0, 0, 0, 0, // 32rd 35th + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + /* clang-format on */ + for (unsigned i = 0; i < 64; i++) { + if (positions[i]) { + munit_assert_true(flagsIsSet(out, (raft_flags)1 << i)); + } else { + munit_assert_false(flagsIsSet(out, (raft_flags)1 << i)); + } + } + out = flagsClear(out, (raft_flags)1 << 32); + munit_assert_true( + out == (raft_flags)(1 << 3 | 1 << 5 | 1 << 18 | (raft_flags)1 << 35)); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_log.c b/test/raft/unit/test_log.c new file mode 100644 index 000000000..0820580aa --- /dev/null +++ b/test/raft/unit/test_log.c @@ -0,0 +1,1237 @@ +#include 
"../../../src/raft/configuration.h" +#include "../../../src/raft/log.h" +#include "../lib/heap.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + struct raft_log *log; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Accessors */ +#define NUM_ENTRIES logNumEntries(f->log) +#define LAST_INDEX logLastIndex(f->log) +#define TERM_OF(INDEX) logTermOf(f->log, INDEX) +#define LAST_TERM logLastTerm(f->log) +#define GET(INDEX) logGet(f->log, INDEX) + +/* Append one command entry with the given term and a hard-coded payload. */ +#define APPEND(TERM) \ + { \ + struct raft_buffer buf_; \ + int rv_; \ + buf_.base = raft_malloc(8); \ + buf_.len = 8; \ + strcpy(buf_.base, "hello"); \ + rv_ = logAppend(f->log, TERM, RAFT_COMMAND, &buf_, NULL); \ + munit_assert_int(rv_, ==, 0); \ + } + +/* Same as APPEND, but repeated N times. */ +#define APPEND_MANY(TERM, N) \ + { \ + int i_; \ + for (i_ = 0; i_ < N; i_++) { \ + APPEND(TERM); \ + } \ + } + +/* Invoke append and assert that it returns the given error. */ +#define APPEND_ERROR(TERM, RV) \ + { \ + struct raft_buffer buf_; \ + int rv_; \ + buf_.base = raft_malloc(8); \ + buf_.len = 8; \ + rv_ = logAppend(f->log, TERM, RAFT_COMMAND, &buf_, NULL); \ + munit_assert_int(rv_, ==, RV); \ + raft_free(buf_.base); \ + } + +/* Append N entries all belonging to the same batch. Each entry will have 64-bit + * payload set to i * 1000, where i is the index of the entry in the batch. */ +#define APPEND_BATCH(N) \ + { \ + void *batch; \ + size_t offset; \ + int i; \ + batch = raft_malloc(8 * N); \ + munit_assert_ptr_not_null(batch); \ + offset = 0; \ + for (i = 0; i < N; i++) { \ + struct raft_buffer buf; \ + int rv; \ + buf.base = (uint8_t *)batch + offset; \ + buf.len = 8; \ + *(uint64_t *)buf.base = i * 1000; \ + rv = logAppend(f->log, 1, RAFT_COMMAND, &buf, batch); \ + munit_assert_int(rv, ==, 0); \ + offset += 8; \ + } \ + } + +#define ACQUIRE(INDEX) \ + { \ + int rv2; \ + rv2 = logAcquire(f->log, INDEX, &entries, &n); \ + munit_assert_int(rv2, ==, 0); \ + } + +#define RELEASE(INDEX) logRelease(f->log, INDEX, entries, n); + +#define TRUNCATE(N) logTruncate(f->log, N) +#define SNAPSHOT(INDEX, TRAILING) logSnapshot(f->log, INDEX, TRAILING) +#define RESTORE(INDEX, TERM) logRestore(f->log, INDEX, TERM) + +/****************************************************************************** + * + * Set up an empty configuration. + * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SET_UP_HEAP; + f->log = logInit(); + if (f->log == NULL) { + munit_assert_true(false); + } + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + logClose(f->log); + TEAR_DOWN_HEAP; + free(f); +} + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert the state of the fixture's log in terms of size, front/back indexes, + * offset and number of entries. 
*/ +#define ASSERT(SIZE, FRONT, BACK, OFFSET, N) \ + munit_assert_int(f->log->size, ==, SIZE); \ + munit_assert_int(f->log->front, ==, FRONT); \ + munit_assert_int(f->log->back, ==, BACK); \ + munit_assert_int(f->log->offset, ==, OFFSET); \ + munit_assert_int(logNumEntries(f->log), ==, N) + +/* Assert the last index and term of the most recent snapshot. */ +#define ASSERT_SNAPSHOT(INDEX, TERM) \ + munit_assert_int(f->log->snapshot.last_index, ==, INDEX); \ + munit_assert_int(f->log->snapshot.last_term, ==, TERM) + +/* Assert that the term of entry at INDEX equals TERM. */ +#define ASSERT_TERM_OF(INDEX, TERM) \ + { \ + const struct raft_entry *entry; \ + entry = logGet(f->log, INDEX); \ + munit_assert_ptr_not_null(entry); \ + munit_assert_int(entry->term, ==, TERM); \ + } + +/* Assert that the number of outstanding references for the entry at INDEX + * equals COUNT. */ +#define ASSERT_REFCOUNT(INDEX, COUNT) \ + { \ + size_t i; \ + munit_assert_ptr_not_null(f->log->refs); \ + for (i = 0; i < f->log->refs_size; i++) { \ + if (f->log->refs[i].index == INDEX) { \ + munit_assert_int(f->log->refs[i].count, ==, COUNT); \ + break; \ + } \ + } \ + if (i == f->log->refs_size) { \ + munit_errorf("no refcount found for entry with index %d", \ + (int)INDEX); \ + } \ + } + +/****************************************************************************** + * + * logNumEntries + * + *****************************************************************************/ + +SUITE(logNumEntries) + +/* If the log is empty, the return value is zero. */ +TEST(logNumEntries, empty, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + munit_assert_int(NUM_ENTRIES, ==, 0); + return MUNIT_OK; +} + +/* The log is not wrapped. */ +TEST(logNumEntries, not_wrapped, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1 /* term */); + munit_assert_int(NUM_ENTRIES, ==, 1); + return MUNIT_OK; +} + +/* The log is wrapped. */ +TEST(logNumEntries, wrapped, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(4 /* last_index */, 1 /* trailing */); + APPEND_MANY(1 /* term */, 2 /* n entries */); + munit_assert_int(NUM_ENTRIES, ==, 4); + return MUNIT_OK; +} + +/* The log has an offset and is empty. */ +TEST(logNumEntries, offset, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(5 /* last index */, 0 /* trailing */); + munit_assert_int(NUM_ENTRIES, ==, 0); + return MUNIT_OK; +} + +/* The log has an offset and is not empty. */ +TEST(logNumEntries, offsetNotEmpty, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(4 /* last index */, 2 /* trailing */); + munit_assert_int(NUM_ENTRIES, ==, 3); + return MUNIT_OK; +} + +/****************************************************************************** + * + * logLastIndex + * + *****************************************************************************/ + +SUITE(logLastIndex) + +/* If the log is empty, last index is 0. */ +TEST(logLastIndex, empty, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + munit_assert_int(LAST_INDEX, ==, 0); + return MUNIT_OK; +} + +/* If the log is empty and has an offset, last index is calculated + accordingly. */ +TEST(logLastIndex, emptyWithOffset, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1); + SNAPSHOT(1, 0); + munit_assert_int(LAST_INDEX, ==, 1); + return MUNIT_OK; +} + +/* The log has one entry. 
*/ +TEST(logLastIndex, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1 /* term */); + munit_assert_int(LAST_INDEX, ==, 1); + return MUNIT_OK; +} + +/* The log has two entries. */ +TEST(logLastIndex, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 2 /* n */); + munit_assert_int(LAST_INDEX, ==, 2); + return MUNIT_OK; +} + +/* If the log starts at a certain offset, the last index is bumped + * accordingly. */ +TEST(logLastIndex, twoWithOffset, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n */); + SNAPSHOT(5 /* last index */, 2 /* trailing */); + munit_assert_int(LAST_INDEX, ==, 5); + return MUNIT_OK; +} + +/****************************************************************************** + * + * logLastTerm + * + *****************************************************************************/ + +SUITE(logLastTerm) + +/* If the log is empty, return zero. */ +TEST(logLastTerm, empty, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + munit_assert_int(LAST_TERM, ==, 0); + return MUNIT_OK; +} + +/* If the log has a snapshot and no outstanding entries, return the last term of + * the snapshot. */ +TEST(logLastTerm, snapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1 /* term */); + SNAPSHOT(1 /* last index */, 0 /* trailing */); + munit_assert_int(LAST_TERM, ==, 1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * logTermOf + * + *****************************************************************************/ + +SUITE(logTermOf) + +/* If the given index is beyond the last index, return 0. */ +TEST(logTermOf, beyondLast, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + munit_assert_int(TERM_OF(2), ==, 0); + munit_assert_int(TERM_OF(10), ==, 0); + return MUNIT_OK; +} + +/* If the log is empty but has a snapshot, and the given index matches the last + * index of the snapshot, return the snapshot last term. */ +TEST(logTermOf, snapshotLastIndex, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(5 /* last entry */, 0 /* trailing */); + munit_assert_int(TERM_OF(5), ==, 1); + return MUNIT_OK; +} + +/* The log has one entry. */ +TEST(logTermOf, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(3 /* term */); + munit_assert_int(TERM_OF(1), ==, 3); + return MUNIT_OK; +} + +/* The log has two entries. */ +TEST(logTermOf, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(4 /* term */, 2 /* n */); + munit_assert_int(TERM_OF(1), ==, 4); + munit_assert_int(TERM_OF(2), ==, 4); + return MUNIT_OK; +} + +/* The log has a snapshot and hence has an offset. */ +TEST(logTermOf, withSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(3 /* last index */, 0 /* trailing */); + munit_assert_int(TERM_OF(1), ==, 0); + munit_assert_int(TERM_OF(2), ==, 0); + munit_assert_int(TERM_OF(3), ==, 1); + munit_assert_int(TERM_OF(4), ==, 1); + munit_assert_int(TERM_OF(5), ==, 1); + return MUNIT_OK; +} + +/* The log has a snapshot with trailing entries. 
*/
+TEST(logTermOf, snapshotTrailing, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND_MANY(1 /* term */, 5 /* n entries */);
+    SNAPSHOT(3 /* last index */, 2 /* trailing */);
+    munit_assert_int(TERM_OF(1), ==, 0);
+    munit_assert_int(TERM_OF(2), ==, 1);
+    munit_assert_int(TERM_OF(3), ==, 1);
+    munit_assert_int(TERM_OF(4), ==, 1);
+    munit_assert_int(TERM_OF(5), ==, 1);
+
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * logGet
+ *
+ *****************************************************************************/
+
+SUITE(logGet)
+
+/* The log is empty. */
+TEST(logGet, empty_log, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    munit_assert_ptr_null(GET(1));
+    return MUNIT_OK;
+}
+
+/* The log is empty but has an offset. */
+TEST(logGet, emptyWithOffset, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND_MANY(4 /* term */, 10 /* n */);
+    SNAPSHOT(10 /* last index */, 0 /* trailing */);
+    munit_assert_ptr_null(GET(1));
+    munit_assert_ptr_null(GET(10));
+    munit_assert_ptr_null(GET(11));
+    return MUNIT_OK;
+}
+
+/* The log has one entry. */
+TEST(logGet, one, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(3 /* term */);
+    munit_assert_int(GET(1)->term, ==, 3);
+    munit_assert_ptr_null(GET(2));
+    return MUNIT_OK;
+}
+
+/* The log has two entries. */
+TEST(logGet, two, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND_MANY(4 /* term */, 2 /* n */);
+    munit_assert_int(GET(1)->term, ==, 4);
+    munit_assert_int(GET(2)->term, ==, 4);
+    munit_assert_ptr_null(GET(3));
+    return MUNIT_OK;
+}
+
+/* The log starts at a certain offset. */
+TEST(logGet, twoWithOffset, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND_MANY(1 /* term */, 3 /* n */);
+    APPEND(2 /* term */);
+    APPEND(3 /* term */);
+    SNAPSHOT(4 /* last index */, 1 /* trailing */);
+    munit_assert_ptr_null(GET(1));
+    munit_assert_ptr_null(GET(2));
+    munit_assert_ptr_null(GET(3));
+    munit_assert_int(GET(4)->term, ==, 2);
+    munit_assert_int(GET(5)->term, ==, 3);
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * logAppend
+ *
+ *****************************************************************************/
+
+SUITE(logAppend)
+
+/* Append one entry to an empty log. */
+TEST(logAppend, one, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1 /* term */);
+    ASSERT(2 /* size */,
+           0 /* front */,
+           1 /* back */,
+           0 /* offset */,
+           1 /* n */);
+    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
+    ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */);
+    return MUNIT_OK;
+}
+
+/* Append two entries to an empty log. */
+TEST(logAppend, two, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1 /* term */);
+    APPEND(1 /* term */);
+    ASSERT(6 /* size */,
+           0 /* front */,
+           2 /* back */,
+           0 /* offset */,
+           2 /* n */);
+    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
+    ASSERT_TERM_OF(2 /* entry index */, 1 /* term */);
+    ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */);
+    ASSERT_REFCOUNT(2 /* entry index */, 1 /* count */);
+    return MUNIT_OK;
+}
+
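The ASSERT(SIZE, FRONT, BACK, OFFSET, N) checks above describe a circular
buffer: size is the number of allocated slots, front and back delimit the
occupied region (wrapping around size), and offset is the raft index of the
entry just before the first in-memory one. A sketch of the indexing this
implies, derived from the comments in these tests rather than from log.c
itself:

    #include <stddef.h>

    /* Slot occupied by the entry with the given raft index. */
    static size_t slot_of(size_t front, size_t size,
                          unsigned long long offset, unsigned long long index)
    {
        return (front + (size_t)(index - offset - 1)) % size;
    }

    /* In the wrap scenarios below: front=4, size=6, offset=4, so entry 5
     * sits in slot 4, entry 6 in slot 5, and entry 7 wraps to slot 0,
     * matching the "[e7, e8, NULL, NULL, e5, e6]" pictures. */

+/* Append three entries in sequence.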
*/ +TEST(logAppend, three, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* One -> [e1, NULL] */ + APPEND(1 /* term */); + + /* Two -> [e1, e2, NULL, NULL, NULL, NULL] */ + APPEND(1 /* term */); + + /* Three -> [e1, e2, e3, NULL, NULL, NULL] */ + APPEND(1 /* term */); + + ASSERT(6 /* size */, + 0 /* front */, + 3 /* back */, + 0 /* offset */, + 3 /* n */); + ASSERT_TERM_OF(1 /* entry index */, 1 /* term */); + ASSERT_TERM_OF(2 /* entry index */, 1 /* term */); + ASSERT_TERM_OF(3 /* entry index */, 1 /* term */); + ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */); + ASSERT_REFCOUNT(2 /* entry index */, 1 /* count */); + ASSERT_REFCOUNT(3 /* entry index */, 1 /* count */); + + return MUNIT_OK; +} + +/* Append enough entries to force the reference count hash table to be + * resized. */ +TEST(logAppend, many, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int i; + for (i = 0; i < 3000; i++) { + APPEND(1 /* term */); + } + munit_assert_int(f->log->refs_size, ==, 4096); + return MUNIT_OK; +} + +/* Append to wrapped log that needs to be grown. */ +TEST(logAppend, wrap, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + APPEND_MANY(1 /* term */, 5 /* n */); + + /* Now the log is [e1, e2, e3, e4, e5, NULL] */ + ASSERT(6 /* size */, + 0 /* front */, + 5 /* back */, + 0 /* offset */, + 5 /* n */); + + /* Delete the first 4 entries. */ + SNAPSHOT(4 /* last entry */, 0 /* trailing */); + + /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ + ASSERT(6 /* size */, + 4 /* front */, + 5 /* back */, + 4 /* offset */, + 1 /* n */); + + /* Append another 3 entries. */ + APPEND_MANY(1 /* term */, 3 /* n */); + + /* Now the log is [e7, e8, NULL, NULL, e5, e6] */ + ASSERT(6 /* size */, + 4 /* front */, + 2 /* back */, + 4 /* offset */, + 4 /* n */); + + /* Append another 3 entries. */ + APPEND_MANY(1 /* term */, 3 /* n */); + + /* Now the log is [e5, ..., e11, NULL, ..., NULL] */ + ASSERT(14 /* size */, + 0 /* front */, + 7 /* back */, + 4 /* offset */, + 7 /* n */); + + return MUNIT_OK; +} + +/* Append a batch of entries to an empty log. */ +TEST(logAppend, batch, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_BATCH(3); + ASSERT(6 /* size */, + 0 /* front */, + 3 /* back */, + 0 /* offset */, + 3 /* n */); + return MUNIT_OK; +} + +static char *logAppendOomHeapFaultDelay[] = {"0", "1", NULL}; +static char *logAppendOomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum logAppendOom[] = { + {TEST_HEAP_FAULT_DELAY, logAppendOomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, logAppendOomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory. */ +TEST(logAppend, oom, setUp, tearDown, 0, logAppendOom) +{ + struct fixture *f = data; + struct raft_buffer buf; + int rv; + buf.base = NULL; + buf.len = 0; + HeapFaultEnable(&f->heap); + rv = logAppend(f->log, 1, RAFT_COMMAND, &buf, NULL); + munit_assert_int(rv, ==, RAFT_NOMEM); + return MUNIT_OK; +} + +/* Out of memory when trying to grow the refs count table. 
*/ +TEST(logAppend, oomRefs, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1, LOG__REFS_INITIAL_SIZE); + HeapFaultConfig(&f->heap, 1, 1); + HeapFaultEnable(&f->heap); + APPEND_ERROR(1, RAFT_NOMEM); + return MUNIT_OK; +} + +/****************************************************************************** + * + * logAppendConfiguration + * + *****************************************************************************/ + +SUITE(logAppendConfiguration) + +static char *logAppendConfigurationOomHeapFaultDelay[] = {"0", "1", NULL}; +static char *logAppendConfigurationOomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum logAppendConfigurationOom[] = { + {TEST_HEAP_FAULT_DELAY, logAppendConfigurationOomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, logAppendConfigurationOomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory. */ +TEST(logAppendConfiguration, oom, setUp, tearDown, 0, logAppendConfigurationOom) +{ + struct fixture *f = data; + struct raft_configuration configuration; + int rv; + + configurationInit(&configuration); + rv = configurationAdd(&configuration, 1, "1", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + + HeapFaultEnable(&f->heap); + + rv = logAppendConfiguration(f->log, 1, &configuration); + munit_assert_int(rv, ==, RAFT_NOMEM); + + configurationClose(&configuration); + + return MUNIT_OK; +} + +/****************************************************************************** + * + * logAcquire + * + *****************************************************************************/ + +SUITE(logAcquire) + +/* Acquire a single log entry. */ +TEST(logAcquire, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + APPEND(1 /* term */); + ACQUIRE(1 /* index */); + munit_assert_ptr_not_null(entries); + munit_assert_int(n, ==, 1); + munit_assert_int(entries[0].type, ==, RAFT_COMMAND); + ASSERT_REFCOUNT(1 /* index */, 2 /* count */); + RELEASE(1 /* index */); + ASSERT_REFCOUNT(1 /* index */, 1 /* count */); + return MUNIT_OK; +} + +/* Acquire two log entries. */ +TEST(logAcquire, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + APPEND(1 /* term */); + APPEND(1 /* term */); + ACQUIRE(1 /* index */); + munit_assert_ptr_not_null(entries); + munit_assert_int(n, ==, 2); + munit_assert_int(entries[0].type, ==, RAFT_COMMAND); + munit_assert_int(entries[1].type, ==, RAFT_COMMAND); + ASSERT_REFCOUNT(1 /* index */, 2 /* count */); + ASSERT_REFCOUNT(2 /* index */, 2 /* count */); + RELEASE(1 /* index */); + ASSERT_REFCOUNT(1 /* index */, 1 /* count */); + ASSERT_REFCOUNT(2 /* index */, 1 /* count */); + return MUNIT_OK; +} + +/* Acquire two log entries in a wrapped log. */ +TEST(logAcquire, wrap, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + + APPEND_MANY(1 /* term */, 5 /* n */); + + /* Now the log is [e1, e2, e3, e4, e5, NULL] */ + ASSERT(6 /* size */, + 0 /* front */, + 5 /* back */, + 0 /* offset */, + 5 /* n */); + + /* Delete the first 4 entries. */ + SNAPSHOT(4 /* last index */, 0 /* trailing */); + + /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ + ASSERT(6 /* size */, + 4 /* front */, + 5 /* back */, + 4 /* offset */, + 1 /* n */); + + /* Append another 3 entries. 
*/
+    APPEND_MANY(1 /* term */, 3 /* n */);
+
+    /* Now the log is [e7, e8, NULL, NULL, e5, e6] */
+    ASSERT(6 /* size */,
+           4 /* front */,
+           2 /* back */,
+           4 /* offset */,
+           4 /* n */);
+
+    ACQUIRE(6 /* index */);
+    munit_assert_int(n, ==, 3);
+    RELEASE(6 /* index */);
+
+    return MUNIT_OK;
+}
+
+/* Acquire several entries, some of which belong to batches. */
+TEST(logAcquire, batch, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry *entries;
+    unsigned n;
+
+    APPEND(1 /* term */);
+    APPEND_BATCH(2 /* n entries */);
+    APPEND(1 /* term */);
+    APPEND_BATCH(3 /* n entries */);
+
+    ACQUIRE(2 /* index */);
+    munit_assert_ptr_not_null(entries);
+    munit_assert_int(n, ==, 6);
+    ASSERT_REFCOUNT(2 /* index */, 2 /* count */);
+
+    /* Truncate the last 5 entries, so the only references left for the second
+     * batch are the ones in the acquired entries. */
+    TRUNCATE(3 /* index */);
+
+    RELEASE(2 /* index */);
+
+    ASSERT_REFCOUNT(2 /* index */, 1 /* count */);
+
+    return MUNIT_OK;
+}
+
+/* Trying to acquire entries out of range results in a NULL pointer. */
+TEST(logAcquire, outOfRange, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry *entries;
+    unsigned n;
+
+    APPEND(1 /* term */);
+    APPEND(1 /* term */);
+    SNAPSHOT(1 /* index */, 0 /* trailing */);
+
+    ACQUIRE(1 /* index */);
+    munit_assert_ptr_null(entries);
+    ACQUIRE(3 /* index */);
+    munit_assert_ptr_null(entries);
+
+    return MUNIT_OK;
+}
+
+/* Out of memory. */
+TEST(logAcquire, oom, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry *entries;
+    unsigned n;
+    int rv;
+
+    APPEND(1 /* term */);
+
+    HeapFaultConfig(&f->heap, 0, 1);
+    HeapFaultEnable(&f->heap);
+
+    rv = logAcquire(f->log, 1, &entries, &n);
+    munit_assert_int(rv, ==, RAFT_NOMEM);
+
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * logTruncate
+ *
+ *****************************************************************************/
+
+SUITE(logTruncate)
+
+/* Truncate the last entry of a log with a single entry. */
+TEST(logTruncate, lastOfOne, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+
+    APPEND(1 /* term */);
+    TRUNCATE(1 /* index */);
+
+    ASSERT(0 /* size */,
+           0 /* front */,
+           0 /* back */,
+           0 /* offset */,
+           0 /* n */);
+
+    return MUNIT_OK;
+}
+
+/* Truncate the last entry of a log with two entries. */
+TEST(logTruncate, lastOfTwo, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+
+    APPEND(1 /* term */);
+    APPEND(1 /* term */);
+
+    TRUNCATE(2 /* index */);
+
+    ASSERT(6 /* size */,
+           0 /* front */,
+           1 /* back */,
+           0 /* offset */,
+           1 /* n */);
+    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
+
+    return MUNIT_OK;
+}
+
+/* Truncate from an entry which makes the log wrap. */
+TEST(logTruncate, wrap, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+
+    APPEND_MANY(1 /* term */, 5 /* n entries */);
+
+    /* Now the log is [e1, e2, e3, e4, e5, NULL] */
+    ASSERT(6 /* size */,
+           0 /* front */,
+           5 /* back */,
+           0 /* offset */,
+           5 /* n */);
+
+    /* Delete the first 4 entries. */
+    SNAPSHOT(4 /* last index */, 0 /* trailing */);
+
+    /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */
+    ASSERT(6 /* size */,
+           4 /* front */,
+           5 /* back */,
+           4 /* offset */,
+           1 /* n */);
+
+    /* Append another 3 entries.
*/ + APPEND_MANY(1 /* term */, 3 /* n entries */); + + /* Now the log is [e7, e8, NULL, NULL, e5, e6] */ + ASSERT(6 /* size */, + 4 /* front */, + 2 /* back */, + 4 /* offset */, + 4 /* n */); + + /* Truncate from e6 onward (wrapping) */ + TRUNCATE(6 /* index */); + + /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ + ASSERT(6 /* size */, + 4 /* front */, + 5 /* back */, + 4 /* offset */, + 1 /* n */); + + return MUNIT_OK; +} + +/* Truncate the last entry of a log with a single entry, which still has an + * outstanding reference created by a call to logAcquire(). */ +TEST(logTruncate, referenced, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + + APPEND(1 /* term */); + ACQUIRE(1 /* index */); + TRUNCATE(1 /* index */); + + ASSERT(0 /* size */, + 0 /* front */, + 0 /* back */, + 0 /* offset */, + 0 /* n */); + + /* The entry has still an outstanding reference. */ + ASSERT_REFCOUNT(1 /* index */, 1 /* count */); + + munit_assert_string_equal((const char *)entries[0].buf.base, "hello"); + + RELEASE(1 /* index */); + ASSERT_REFCOUNT(1 /* index */, 0 /* count */); + + return MUNIT_OK; +} + +/* Truncate all entries belonging to a batch. */ +TEST(logTruncate, batch, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_BATCH(3 /* n entries */); + TRUNCATE(1 /* index */); + munit_assert_int(f->log->size, ==, 0); + return MUNIT_OK; +} + +/* Acquire entries at a certain index. Truncate the log at that index. The + * truncated entries are still referenced. Then append a new entry, which will + * have the same index but different term. */ +TEST(logTruncate, acquired, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + + APPEND(1 /* term */); + APPEND(1 /* term */); + ACQUIRE(2 /* index */); + munit_assert_int(n, ==, 1); + + TRUNCATE(2 /* index */); + + APPEND(2 /* term */); + + RELEASE(2 /*index */); + + return MUNIT_OK; +} + +/* Acquire some entries, truncate the log and then append new ones forcing the + log to be grown and the reference count hash table to be re-built. */ +TEST(logTruncate, acquireAppend, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + size_t i; + + APPEND(1 /* term */); + APPEND(1 /* term */); + + ACQUIRE(2); + + munit_assert_int(n, ==, 1); + + TRUNCATE(2); + + for (i = 0; i < LOG__REFS_INITIAL_SIZE; i++) { + APPEND(2 /* term */); + } + + RELEASE(2); + + return MUNIT_OK; +} + +static char *logTruncateAcquiredHeapFaultDelay[] = {"0", NULL}; +static char *logTruncateAcquiredFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum logTruncateAcquiredOom[] = { + {TEST_HEAP_FAULT_DELAY, logTruncateAcquiredHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, logTruncateAcquiredFaultRepeat}, + {NULL, NULL}, +}; + +/* Acquire entries at a certain index. Truncate the log at that index. The + * truncated entries are still referenced. Then append a new entry, which fails + * to be appended due to OOM. 
*/ +TEST(logTruncate, acquiredOom, setUp, tearDown, 0, logTruncateAcquiredOom) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + struct raft_buffer buf; + int rv; + + APPEND(1 /* term */); + APPEND(1 /* term */); + + ACQUIRE(2); + munit_assert_int(n, ==, 1); + + TRUNCATE(2); + + buf.base = NULL; + buf.len = 0; + + HeapFaultEnable(&f->heap); + + rv = logAppend(f->log, 2, RAFT_COMMAND, &buf, NULL); + munit_assert_int(rv, ==, RAFT_NOMEM); + + RELEASE(2); + + return MUNIT_OK; +} + +/****************************************************************************** + * + * logSnapshot + * + *****************************************************************************/ + +SUITE(logSnapshot) + +/* Take a snapshot at entry 3, keeping 2 trailing entries. */ +TEST(logSnapshot, trailing, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + APPEND(1 /* term */); + APPEND(2 /* term */); + APPEND(2 /* term */); + + SNAPSHOT(3 /* last index */, 2 /* trailing */); + + ASSERT(6 /* size */, + 1 /* front */, + 3 /* back */, + 1 /* offset */, + 2 /* n */); + + ASSERT_SNAPSHOT(3 /* index */, 2 /* term */); + + munit_assert_int(NUM_ENTRIES, ==, 2); + munit_assert_int(LAST_INDEX, ==, 3); + + return MUNIT_OK; +} + +/* Take a snapshot when the number of outstanding entries is lower than the + * desired trail (so no entry will be deleted). */ +TEST(logSnapshot, trailingHigherThanNumEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* Take a snapshot leaving just one entry in the log. */ + APPEND_MANY(1 /* term */, 3 /* n entries */); + SNAPSHOT(3 /* last index */, 1 /* trailing */); + + /* Take another snapshot, trying to leave 3 entries, but only 2 are + * available at all. */ + APPEND(2 /* term */); + + SNAPSHOT(4 /* last index */, 3 /* trailing */); + + ASSERT(6 /* size */, + 2 /* front */, + 4 /* back */, + 2 /* offset */, + 2 /* n */); + + ASSERT_SNAPSHOT(4 /* index */, 2 /* term */); + + munit_assert_int(NUM_ENTRIES, ==, 2); + munit_assert_int(LAST_INDEX, ==, 4); + + return MUNIT_OK; +} + +/* Take a snapshot when the number of outstanding entries is exactly equal to + * the desired trail (so no entry will be deleted). */ +TEST(logSnapshot, trailingMatchesOutstanding, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* Take a snapshot leaving just one entry in the log. */ + APPEND_MANY(1 /* term */, 3 /* n entries */); + SNAPSHOT(3 /* last index */, 1 /* trailing */); + + /* Take another snapshot, leaving 2 entries, which are the ones we have. */ + APPEND(2 /* term */); + + SNAPSHOT(4 /* last index */, 2 /* trailing */); + + ASSERT(6 /* size */, + 2 /* front */, + 4 /* back */, + 2 /* offset */, + 2 /* n */); + + ASSERT_SNAPSHOT(4 /* index */, 2 /* term */); + + munit_assert_int(NUM_ENTRIES, ==, 2); + munit_assert_int(LAST_INDEX, ==, 4); + + return MUNIT_OK; +} + +/* Take a snapshot at an index which is not the last one. */ +TEST(logSnapshot, lessThanHighestIndex, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* Take a snapshot leaving three entries in the log. */ + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(4 /* last index */, 2 /* trailing */); + + ASSERT(6 /* size */, + 2 /* front */, + 5 /* back */, + 2 /* offset */, + 3 /* n */); + + ASSERT_SNAPSHOT(4 /* index */, 1 /* term */); + + munit_assert_int(NUM_ENTRIES, ==, 3); + munit_assert_int(LAST_INDEX, ==, 5); + + return MUNIT_OK; +} + +/* Take a snapshot at a point where the log needs to wrap. 
*/ +TEST(logSnapshot, wrap, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + APPEND_MANY(1 /* term */, 5 /* n entries */); + + /* Now the log is [e1, e2, e3, e4, e5, NULL] */ + ASSERT(6 /* size */, + 0 /* front */, + 5 /* back */, + 0 /* offset */, + 5 /* n */); + + /* Take a snapshot at e5, keeping just e5 itself. */ + SNAPSHOT(5 /* last index */, 1 /* trailing */); + + /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ + ASSERT(6 /* size */, + 4 /* front */, + 5 /* back */, + 4 /* offset */, + 1 /* n */); + + ASSERT_SNAPSHOT(5 /* index */, 1 /* term */); + + /* Append another 4 entries. */ + APPEND_MANY(1 /* term */, 4 /* n */); + + /* Now the log is [e7, e8, e9, NULL, e5, e6] */ + ASSERT(6 /* size */, + 4 /* front */, + 3 /* back */, + 4 /* offset */, + 5 /* n */); + + /* Take a snapshot at e8 keeping only e8 itself (wrapping) */ + SNAPSHOT(8 /* last index */, 1 /* trailing */); + + /* Now the log is [NULL, e8, e9, NULL, NULL, NULL] */ + ASSERT(6 /* size */, + 1 /* front */, + 3 /* back */, + 7 /* offset */, + 2 /* n */); + + ASSERT_SNAPSHOT(8 /* index */, 1 /* term */); + + return MUNIT_OK; +} + +/****************************************************************************** + * + * logRestore + * + *****************************************************************************/ + +SUITE(logRestore) + +/* Mimic the initial restore of a snapshot after loading state from disk, when + * there are no outstanding entries. */ +TEST(logRestore, initial, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + RESTORE(2 /* last index */, 3 /* last term */); + ASSERT_SNAPSHOT(2 /* index */, 3 /* term */); + munit_assert_int(LAST_INDEX, ==, 2); + return MUNIT_OK; +} + +/* If there are existing entries they are wiped out. */ +TEST(logRestore, wipe, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + RESTORE(2 /* last index */, 3 /* last term */); + ASSERT_SNAPSHOT(2 /* index */, 3 /* term */); + munit_assert_int(LAST_INDEX, ==, 2); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_queue.c b/test/raft/unit/test_queue.c new file mode 100644 index 000000000..aee0f0a4d --- /dev/null +++ b/test/raft/unit/test_queue.c @@ -0,0 +1,260 @@ +#include "../../../src/raft/queue.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture with a single queue and a few test items that can be added to it. + * + *****************************************************************************/ + +struct item +{ + int value; + queue queue; +}; + +struct fixture +{ + queue queue; + struct item items[3]; +}; + +static void *setUp(MUNIT_UNUSED const MunitParameter params[], + MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + QUEUE_INIT(&f->queue); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Initialize and push the given number of fixture items to the fixture's + * queue. Each item will have a value equal to its index plus one. */ +#define PUSH(N) \ + { \ + int i_; \ + for (i_ = 0; i_ < N; i_++) { \ + struct item *item_ = &f->items[i_]; \ + item_->value = i_ + 1; \ + QUEUE_PUSH(&f->queue, &item_->queue); \ + } \ + } + +/* Remove the i'th fixture item from the fixture queue. 
*/ +#define REMOVE(I) QUEUE_REMOVE(&f->items[I].queue) + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert that the item at the head of the fixture's queue has the given + * value. */ +#define ASSERT_HEAD(VALUE) \ + { \ + queue *head_ = QUEUE_HEAD(&f->queue); \ + struct item *item_; \ + item_ = QUEUE_DATA(head_, struct item, queue); \ + munit_assert_int(item_->value, ==, VALUE); \ + } + +/* Assert that the item at the tail of the queue has the given value. */ +#define ASSERT_TAIL(VALUE) \ + { \ + queue *tail_ = QUEUE_TAIL(&f->queue); \ + struct item *item_; \ + item_ = QUEUE_DATA(tail_, struct item, queue); \ + munit_assert_int(item_->value, ==, VALUE); \ + } + +/* Assert that the fixture's queue is empty. */ +#define ASSERT_EMPTY munit_assert_true(QUEUE_IS_EMPTY(&f->queue)) + +/* Assert that the fixture's queue is not empty. */ +#define ASSERT_NOT_EMPTY munit_assert_false(QUEUE_IS_EMPTY(&f->queue)) + +/****************************************************************************** + * + * QUEUE_IS_EMPTY + * + *****************************************************************************/ + +SUITE(QUEUE_IS_EMPTY) + +TEST(QUEUE_IS_EMPTY, yes, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ASSERT_EMPTY; + return MUNIT_OK; +} + +TEST(QUEUE_IS_EMPTY, no, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(1); + ASSERT_NOT_EMPTY; + return MUNIT_OK; +} + +/****************************************************************************** + * + * QUEUE_PUSH + * + *****************************************************************************/ + +SUITE(QUEUE_PUSH) + +TEST(QUEUE_PUSH, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(1); + ASSERT_HEAD(1); + return MUNIT_OK; +} + +TEST(QUEUE_PUSH, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int i; + PUSH(2); + for (i = 0; i < 2; i++) { + ASSERT_HEAD(i + 1); + REMOVE(i); + } + ASSERT_EMPTY; + return MUNIT_OK; +} + +/****************************************************************************** + * + * QUEUE_REMOVE + * + *****************************************************************************/ + +SUITE(QUEUE_REMOVE) + +TEST(QUEUE_REMOVE, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(3); + REMOVE(0); + ASSERT_HEAD(2); + return MUNIT_OK; +} + +TEST(QUEUE_REMOVE, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(3); + REMOVE(1); + ASSERT_HEAD(1); + return MUNIT_OK; +} + +TEST(QUEUE_REMOVE, success, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(3); + REMOVE(2); + ASSERT_HEAD(1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * QUEUE_TAIL + * + *****************************************************************************/ + +SUITE(QUEUE_TAIL) + +TEST(QUEUE_TAIL, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(1); + ASSERT_TAIL(1); + return MUNIT_OK; +} + +TEST(QUEUE_TAIL, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(2); + ASSERT_TAIL(2); + return MUNIT_OK; +} + +TEST(QUEUE_TAIL, three, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(3); + ASSERT_TAIL(3); + return MUNIT_OK; +} + +/****************************************************************************** + * + * QUEUE_FOREACH + * + *****************************************************************************/ + 
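QUEUE_DATA recovers the struct that embeds a queue node, in the classic
intrusive-list style, so QUEUE_FOREACH can hand back a queue pointer and the
caller maps it to its item. A sketch of the usual container-of definition
behind it (the actual macro lives in src/raft/queue.h):

    #include <stddef.h>

    /* Map a pointer to an embedded `field` back to its enclosing `type`. */
    #define QUEUE_DATA_SKETCH(e, type, field) \
        ((type *)((char *)(e)-offsetof(type, field)))
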
+SUITE(QUEUE_FOREACH)
+
+/* Loop through a queue of zero items. */
+TEST(QUEUE_FOREACH, zero, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    queue *head;
+    int count = 0;
+    QUEUE_FOREACH (head, &f->queue) {
+        count++;
+    }
+    munit_assert_int(count, ==, 0);
+    return MUNIT_OK;
+}
+
+/* Loop through a queue of one item. */
+TEST(QUEUE_FOREACH, one, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    queue *head;
+    int count = 0;
+    PUSH(1);
+    QUEUE_FOREACH (head, &f->queue) {
+        count++;
+    }
+    munit_assert_int(count, ==, 1);
+    return MUNIT_OK;
+}
+
+/* Loop through a queue of two items. The order of the loop is from the head to
+ * the tail. */
+TEST(QUEUE_FOREACH, two, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    queue *head;
+    int values[2] = {0, 0};
+    int i = 0;
+    PUSH(2);
+    QUEUE_FOREACH (head, &f->queue) {
+        struct item *item;
+        item = QUEUE_DATA(head, struct item, queue);
+        values[i] = item->value;
+        i++;
+    }
+    munit_assert_int(values[0], ==, 1);
+    munit_assert_int(values[1], ==, 2);
+    return MUNIT_OK;
+}
diff --git a/test/raft/unit/test_uv_fs.c b/test/raft/unit/test_uv_fs.c
new file mode 100644
index 000000000..a72206dc9
--- /dev/null
+++ b/test/raft/unit/test_uv_fs.c
@@ -0,0 +1,473 @@
+#include <unistd.h>
+
+#include "../../../src/raft/uv_fs.h"
+#include "../../../src/raft/uv_os.h"
+#include "../lib/aio.h"
+#include "../lib/dir.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * UvFsCheckDir
+ *
+ *****************************************************************************/
+
+/* Invoke UvFsCheckDir passing it the given dir. */
+#define CHECK_DIR(DIR)                          \
+    {                                           \
+        int _rv;                                \
+        char _errmsg[RAFT_ERRMSG_BUF_SIZE];     \
+        _rv = UvFsCheckDir(DIR, _errmsg);       \
+        munit_assert_int(_rv, ==, 0);           \
+    }
+
+/* Invoke UvFsCheckDir passing it the given dir and check that the given error
+ * occurs. */
+#define CHECK_DIR_ERROR(DIR, RV, ERRMSG)            \
+    {                                               \
+        int _rv;                                    \
+        char _errmsg[RAFT_ERRMSG_BUF_SIZE];         \
+        _rv = UvFsCheckDir(DIR, _errmsg);           \
+        munit_assert_int(_rv, ==, RV);              \
+        munit_assert_string_equal(_errmsg, ERRMSG); \
+    }
+
+SUITE(UvFsCheckDir)
+
+/* If the directory exists, the function succeeds. */
+TEST(UvFsCheckDir, exists, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    CHECK_DIR(dir);
+    return MUNIT_OK;
+}
+
+/* If the directory doesn't exist, an error is returned. */
+TEST(UvFsCheckDir, doesNotExist, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *parent = data;
+    char errmsg[RAFT_ERRMSG_BUF_SIZE];
+    char dir[128];
+    sprintf(dir, "%s/sub", parent);
+    sprintf(errmsg, "directory '%s' does not exist", dir);
+    CHECK_DIR_ERROR(dir, RAFT_NOTFOUND, errmsg);
+    return MUNIT_OK;
+}
+
+/* If the process can't access the directory, an error is returned. */
+TEST(UvFsCheckDir, permissionDenied, NULL, NULL, 0, NULL)
+{
+    bool has_access = DirHasFile("/proc/1", "root");
+    /* Skip the test if the process actually has access to /proc/1/root. */
+    if (has_access) {
+        return MUNIT_SKIP;
+    }
+    CHECK_DIR_ERROR("/proc/1/root", RAFT_UNAUTHORIZED,
+                    "can't access directory '/proc/1/root'");
+    return MUNIT_OK;
+}
+
+/* If the given path contains a non-directory prefix, an error is returned. */
+TEST(UvFsCheckDir, notDirPrefix, NULL, NULL, 0, NULL)
+{
+    CHECK_DIR_ERROR("/dev/null/foo", RAFT_INVALID,
+                    "path '/dev/null/foo' is not a directory");
+    return MUNIT_OK;
+}
+
+/* If the given path is not a directory, an error is returned. */
+TEST(UvFsCheckDir, notDir, NULL, NULL, 0, NULL)
+{
+    CHECK_DIR_ERROR("/dev/null", RAFT_INVALID,
+                    "path '/dev/null' is not a directory");
+    return MUNIT_OK;
+}
+
+/* If the given directory is not writable, an error is returned. */
+TEST(UvFsCheckDir, notWritable, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    char errmsg[RAFT_ERRMSG_BUF_SIZE];
+    sprintf(errmsg, "directory '%s' is not writable", dir);
+    DirMakeUnwritable(dir);
+    CHECK_DIR_ERROR(dir, RAFT_INVALID, errmsg);
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvFsSyncDir
+ *
+ *****************************************************************************/
+
+/* Invoke UvFsSyncDir passing it the given dir and check that the given error
+ * occurs. */
+#define SYNC_DIR_ERROR(DIR, RV, ERRMSG)                      \
+    {                                                        \
+        char _errmsg[RAFT_ERRMSG_BUF_SIZE];                  \
+        munit_assert_int(UvFsSyncDir(DIR, _errmsg), ==, RV); \
+        munit_assert_string_equal(_errmsg, ERRMSG);          \
+    }
+
+SUITE(UvFsSyncDir)
+
+/* If the directory doesn't exist, an error is returned. */
+TEST(UvFsSyncDir, noExists, NULL, NULL, 0, NULL)
+{
+    SYNC_DIR_ERROR("/abcdef", RAFT_IOERR,
+                   "open directory: no such file or directory");
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvFsOpenFileForReading
+ *
+ *****************************************************************************/
+
+/* Open a file in the given dir and check that the given error occurs. */
+#define OPEN_FILE_FOR_READING_ERROR(DIR, FILENAME, RV, ERRMSG)          \
+    {                                                                   \
+        uv_file fd_;                                                    \
+        char errmsg_[RAFT_ERRMSG_BUF_SIZE];                             \
+        int rv_ = UvFsOpenFileForReading(DIR, FILENAME, &fd_, errmsg_); \
+        munit_assert_int(rv_, ==, RV);                                  \
+        munit_assert_string_equal(errmsg_, ERRMSG);                     \
+    }
+
+SUITE(UvFsOpenFileForReading)
+
+/* If the directory doesn't exist, an error is returned. */
+TEST(UvFsOpenFileForReading, noExists, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    OPEN_FILE_FOR_READING_ERROR(dir, "foo", RAFT_IOERR,
+                                "open: no such file or directory");
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvFsAllocateFile
+ *
+ *****************************************************************************/
+
+#define FALLOCATE_PARAM "fallocate"
+static char *fallocate_params[] = {"1", "0", NULL};
+MunitParameterEnum fallocateParams[] = {
+    {FALLOCATE_PARAM, fallocate_params},
+    {NULL, NULL},
+};
+
+/* Allocate a file with the given parameters and assert that no error occurred.
+ */
+#define ALLOCATE_FILE(DIR, FILENAME, SIZE)                                    \
+    {                                                                         \
+        uv_file fd_;                                                          \
+        char errmsg_[RAFT_ERRMSG_BUF_SIZE];                                   \
+        int rv_;                                                              \
+        bool fallocate_ = true;                                               \
+        const char *f = munit_parameters_get(params, FALLOCATE_PARAM);        \
+        if (f != NULL) {                                                      \
+            fallocate_ = atoi(f);                                             \
+        }                                                                     \
+        rv_ =                                                                 \
+            UvFsAllocateFile(DIR, FILENAME, SIZE, &fd_, fallocate_, errmsg_); \
+        munit_assert_int(rv_, ==, 0);                                         \
+        munit_assert_int(UvOsClose(fd_), ==, 0);                              \
+    }
+
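The "fallocate" test parameter toggles between reserving the file's blocks up
front and merely extending its size. A hypothetical sketch of the two paths
(names invented for illustration; the real logic lives in src/raft/uv_fs.c):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdbool.h>
    #include <unistd.h>

    static int allocate_sketch(int fd, off_t size, bool use_fallocate)
    {
        if (use_fallocate) {
            /* Reserve the blocks now, so ENOSPC surfaces immediately. */
            return posix_fallocate(fd, 0, size); /* 0 or an errno value */
        }
        /* Path for file systems without fallocate support: just extend the
         * file without reserving blocks. */
        return ftruncate(fd, size) == 0 ? 0 : errno;
    }

+/* Assert that creating a file with the given parameters fails with the given
+ * code and error message.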
*/ +#define ALLOCATE_FILE_ERROR(DIR, FILENAME, SIZE, RV, ERRMSG) \ + { \ + uv_file fd_; \ + char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \ + int rv_; \ + bool fallocate_ = true; \ + const char *f = munit_parameters_get(params, FALLOCATE_PARAM); \ + if (f != NULL) { \ + fallocate_ = atoi(f); \ + } \ + rv_ = \ + UvFsAllocateFile(DIR, FILENAME, SIZE, &fd_, fallocate_, errmsg_); \ + munit_assert_int(rv_, ==, RV); \ + munit_assert_string_equal(errmsg_, ERRMSG); \ + } + +SUITE(UvFsAllocateFile) + +/* If the given path is valid, the file gets created. */ +TEST(UvFsAllocateFile, success, DirSetUp, DirTearDown, 0, fallocateParams) +{ + const char *dir = data; + ALLOCATE_FILE(dir, /* dir */ + "foo", /* filename */ + 4096 /* size */); + munit_assert_true(DirHasFile(dir, "foo")); + return MUNIT_OK; +} + +/* The directory of given path does not exist, an error is returned. */ +TEST(UvFsAllocateFile, dirNoExists, NULL, NULL, 0, fallocateParams) +{ + ALLOCATE_FILE_ERROR("/non/existing/dir", /* dir */ + "foo", /* filename */ + 64, /* size */ + RAFT_IOERR, /* status */ + "open: no such file or directory"); + return MUNIT_OK; +} + +/* If the given path already exists, an error is returned. */ +TEST(UvFsAllocateFile, + fileAlreadyExists, + DirSetUp, + DirTearDown, + 0, + fallocateParams) +{ + const char *dir = data; + char buf[8] = {0}; + DirWriteFile(dir, "foo", buf, sizeof buf); + ALLOCATE_FILE_ERROR(dir, /* dir */ + "foo", /* filename */ + 64, /* size */ + RAFT_IOERR, /* status */ + "open: file already exists"); + return MUNIT_OK; +} + +static char *dirTmpfs_params[] = {"tmpfs", NULL}; + +MunitParameterEnum noSpaceParams[] = { + {DIR_FS_PARAM, dirTmpfs_params}, + {"fallocate", fallocate_params}, + {NULL, NULL}, +}; + +/* The file system has run out of space. */ +TEST(UvFsAllocateFile, noSpace, DirSetUp, DirTearDown, 0, noSpaceParams) +{ + const char *dir = data; + if (dir == NULL) { + return MUNIT_SKIP; + } + ALLOCATE_FILE_ERROR(dir, /* dir */ + "foo", /* filename */ + 4096 * 32768, /* size */ + RAFT_NOSPACE, /* status */ + "not enough space to allocate 134217728 bytes"); + munit_assert_false(DirHasFile(dir, "foo")); + return MUNIT_OK; +} + +/****************************************************************************** + * + * UvFsProbeCapabilities + * + *****************************************************************************/ + +/* Invoke UvFsProbeCapabilities against the given dir and assert that it returns + * the given values for direct I/O and async I/O. */ +#define PROBE_CAPABILITIES(DIR, DIRECT_IO, ASYNC_IO, FALLOCATE) \ + { \ + size_t direct_io_; \ + bool async_io_; \ + bool fallocate_; \ + char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \ + int rv_; \ + rv_ = UvFsProbeCapabilities(DIR, &direct_io_, &async_io_, &fallocate_, \ + errmsg_); \ + munit_assert_int(rv_, ==, 0); \ + munit_assert_size(direct_io_, ==, DIRECT_IO); \ + munit_assert_int(fallocate_, ==, FALLOCATE); \ + if (ASYNC_IO) { \ + munit_assert_true(async_io_); \ + } else { \ + munit_assert_false(async_io_); \ + } \ + } + +/* Invoke UvFsProbeCapabilities and check that the given error occurs. 
*/
+#define PROBE_CAPABILITIES_ERROR(DIR, RV, ERRMSG)                              \
+    {                                                                          \
+        size_t direct_io_;                                                     \
+        bool async_io_;                                                        \
+        bool fallocate_;                                                       \
+        char errmsg_[RAFT_ERRMSG_BUF_SIZE];                                    \
+        int rv_;                                                               \
+        rv_ = UvFsProbeCapabilities(DIR, &direct_io_, &async_io_, &fallocate_, \
+                                    errmsg_);                                  \
+        munit_assert_int(rv_, ==, RV);                                         \
+        munit_assert_string_equal(errmsg_, ERRMSG);                            \
+    }
+
+SUITE(UvFsProbeCapabilities)
+
+TEST(UvFsProbeCapabilities, tmpfs, DirTmpfsSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    PROBE_CAPABILITIES(dir, 0, false, true);
+    return MUNIT_OK;
+}
+
+/* ZFS 0.8 reports that it supports direct I/O, but does not fully support
+ * asynchronous kernel AIO. */
+TEST(UvFsProbeCapabilities, zfsDirectIO, DirZfsSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    size_t direct_io = 0;
+#if defined(RAFT_HAVE_ZFS_WITH_DIRECT_IO)
+    direct_io = 4096;
+#endif
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    PROBE_CAPABILITIES(dir, direct_io, false, true);
+    return MUNIT_OK;
+}
+
+/* File systems that fully support direct I/O and async I/O. */
+TEST(UvFsProbeCapabilities, aio, DirSetUp, DirTearDown, 0, DirAioParams)
+{
+    const char *dir = data;
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    /* FIXME: btrfs doesn't like that we perform a first write to the probe file
+     * to detect the direct I/O buffer size. */
+    if (strcmp(munit_parameters_get(params, DIR_FS_PARAM), "btrfs") == 0) {
+        return MUNIT_SKIP;
+    }
+    PROBE_CAPABILITIES(dir, 4096, true, true);
+    return MUNIT_OK;
+}
+
+/* If the given path is not executable, the block size of the underlying file
+ * system can't be determined and an error is returned. */
+TEST(UvFsProbeCapabilities, noAccess, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+
+    /* Skip the test when running as root, since EACCES would not be triggered
+     * in that case. */
+    if (getuid() == 0) {
+        return MUNIT_SKIP;
+    }
+
+    DirMakeUnexecutable(dir);
+    PROBE_CAPABILITIES_ERROR(
+        dir, RAFT_IOERR,
+        "create I/O capabilities probe file: open: permission denied");
+
+    return MUNIT_OK;
+}
+
+/* No space is left on the target device. */
+TEST(UvFsProbeCapabilities, noSpace, DirTmpfsSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    DirFill(dir, 0);
+    PROBE_CAPABILITIES_ERROR(dir, RAFT_NOSPACE,
+                             "create I/O capabilities probe file: not enough "
+                             "space to allocate 4096 bytes");
+    return MUNIT_OK;
+}
+
+/* The uvIoSetup() call fails with EAGAIN. */
+TEST(UvFsProbeCapabilities, noResources, DirBtrfsSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    aio_context_t ctx = 0;
+    int rv;
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    rv = AioFill(&ctx, 0);
+    if (rv != 0) {
+        return MUNIT_SKIP;
+    }
+    PROBE_CAPABILITIES_ERROR(
+        dir, RAFT_IOERR,
+        "probe Async I/O: io_setup: resource temporarily unavailable");
+    AioDestroy(ctx);
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvFsMakeFile
+ *
+ *****************************************************************************/
+
+SUITE(UvFsMakeFile)
+
+/* If the file does not exist, the function succeeds. */
+TEST(UvFsMakeFile, notExists, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    int rv;
+    char errmsg[RAFT_ERRMSG_BUF_SIZE];
+    struct raft_buffer bufs[2] = {{0}, {0}};
+    rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg);
+    munit_assert_int(rv, ==, 0);
+    return MUNIT_OK;
+}
+
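The exists test that follows pins down the creation semantics: UvFsMakeFile
must fail when the target is already present, as an open with O_CREAT | O_EXCL
would. A behavioral sketch (whether the implementation uses exactly these
flags is an assumption):

    #include <fcntl.h>

    static int make_file_sketch(const char *path)
    {
        int fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0600);
        if (fd == -1) {
            return -1; /* errno is EEXIST if the file already exists */
        }
        /* ... write the buffers, fsync and close ... */
        return 0;
    }

+/* If the file exists, the function does not succeed.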
*/ +TEST(UvFsMakeFile, exists, DirSetUp, DirTearDown, 0, NULL) +{ + const char *dir = data; + int rv; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + struct raft_buffer bufs[2] = {{0}, {0}}; + rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); + munit_assert_int(rv, ==, 0); + rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); + munit_assert_int(rv, !=, 0); + return MUNIT_OK; +} + +/****************************************************************************** + * + * UvFsRenameFile + * + *****************************************************************************/ + +SUITE(UvFsRenameFile) + +TEST(UvFsRenameFile, rename, DirSetUp, DirTearDown, 0, NULL) +{ + const char *dir = data; + int rv; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + struct raft_buffer bufs[2] = {{0}, {0}}; + rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); + munit_assert_int(rv, ==, 0); + rv = UvFsRenameFile(dir, "foo", "bar", errmsg); + munit_assert_int(rv, ==, 0); + munit_assert_false(DirHasFile(dir, "foo")); + munit_assert_true(DirHasFile(dir, "bar")); + return MUNIT_OK; +} + +/* rename to same name */ +TEST(UvFsRenameFile, same, DirSetUp, DirTearDown, 0, NULL) +{ + const char *dir = data; + int rv; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + struct raft_buffer bufs[2] = {{0}, {0}}; + rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); + munit_assert_int(rv, ==, 0); + rv = UvFsRenameFile(dir, "foo", "foo", errmsg); + munit_assert_int(rv, ==, 0); + munit_assert_true(DirHasFile(dir, "foo")); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_uv_os.c b/test/raft/unit/test_uv_os.c new file mode 100644 index 000000000..ebbe5f7f1 --- /dev/null +++ b/test/raft/unit/test_uv_os.c @@ -0,0 +1,85 @@ +#include "../../../src/raft/uv_os.h" +#include "../lib/runner.h" + +SUITE(UvOsJoin) + +/* dir and filename have sensible lengths */ +TEST(UvOsJoin, basic, NULL, NULL, 0, NULL) +{ + int rv; + const char *dir = "/home"; + const char *filename = "testfile"; + char path[UV__PATH_SZ]; + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, ==, 0); + munit_assert_string_equal(path, "/home/testfile"); + return MUNIT_OK; +} + +TEST(UvOsJoin, dirTooLong, NULL, NULL, 0, NULL) +{ + int rv; + char path[UV__PATH_SZ]; + char dir[UV__DIR_LEN + 2]; /* Room for '\0' and then 1 char over limit. */ + memset((char *)dir, '/', sizeof(dir)); + dir[sizeof(dir) - 1] = '\0'; + const char *filename = "testfile"; + + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, !=, 0); + return MUNIT_OK; +} + +TEST(UvOsJoin, filenameTooLong, NULL, NULL, 0, NULL) +{ + int rv; + char path[UV__PATH_SZ]; + const char *dir = "testdir"; + char filename[UV__FILENAME_LEN + 2]; + memset((char *)filename, 'a', sizeof(filename)); + filename[sizeof(filename) - 1] = '\0'; + + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, !=, 0); + return MUNIT_OK; +} + +TEST(UvOsJoin, dirAndFilenameTooLong, NULL, NULL, 0, NULL) +{ + int rv; + /* +2 to silence compilers that complain that dir & filename would overflow + * path, but it's strictly not needed and doesn't influence the test. 
*/ + char path[UV__PATH_SZ + 2]; + char dir[UV__DIR_LEN + 2]; + memset((char *)dir, '/', sizeof(dir)); + dir[sizeof(dir) - 1] = '\0'; + + char filename[UV__FILENAME_LEN + 2]; + memset((char *)filename, 'a', sizeof(filename)); + filename[sizeof(filename) - 1] = '\0'; + + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, !=, 0); + return MUNIT_OK; +} + +TEST(UvOsJoin, dirAndFilenameMax, NULL, NULL, 0, NULL) +{ + int rv; + char path[UV__PATH_SZ]; + char dir[UV__DIR_LEN + 1]; + memset((char *)dir, '/', sizeof(dir)); + dir[sizeof(dir) - 1] = '\0'; + + char filename[UV__FILENAME_LEN + 1]; + memset((char *)filename, 'a', sizeof(filename)); + filename[sizeof(filename) - 1] = '\0'; + + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, ==, 0); + char cmp_path[UV__DIR_LEN + UV__FILENAME_LEN + 1 + 1]; + snprintf(cmp_path, UV__DIR_LEN + UV__FILENAME_LEN + 1 + 1, "%s/%s", dir, + filename); + munit_assert_string_equal(path, cmp_path); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_uv_writer.c b/test/raft/unit/test_uv_writer.c new file mode 100644 index 000000000..27ac4d665 --- /dev/null +++ b/test/raft/unit/test_uv_writer.c @@ -0,0 +1,391 @@ +#include "../../../src/raft/uv_fs.h" +#include "../../../src/raft/uv_writer.h" +#include "../lib/aio.h" +#include "../lib/dir.h" +#include "../lib/loop.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture with a UvWriter and an open file ready for writing. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_DIR; + FIXTURE_LOOP; + int fd; + size_t block_size; + size_t direct_io; + bool fallocate; + bool async_io; + char errmsg[256]; + struct UvWriter writer; + bool closed; +}; + +/****************************************************************************** + * + * Helper macros. + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; +}; + +static void closeCb(struct UvWriter *writer) +{ + struct fixture *f = writer->data; + f->closed = true; +} + +static void submitCbAssertResult(struct UvWriterReq *req, int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +/* Initialize the fixture's writer. */ +#define INIT(MAX_WRITES) \ + do { \ + int _rv; \ + _rv = UvWriterInit(&f->writer, &f->loop, f->fd, f->direct_io != 0, \ + f->async_io, MAX_WRITES, f->errmsg); \ + munit_assert_int(_rv, ==, 0); \ + f->writer.data = f; \ + f->closed = false; \ + } while (0) + +/* Try to initialize the fixture's writer and check that the given error is + * returned. */ +#define INIT_ERROR(RV, ERRMSG) \ + do { \ + int _rv; \ + _rv = UvWriterInit(&f->writer, &f->loop, f->fd, f->direct_io != 0, \ + f->async_io, 1, f->errmsg); \ + munit_assert_int(_rv, ==, RV); \ + munit_assert_string_equal(f->errmsg, ERRMSG); \ + } while (0) + +/* Close helper. 
*/ +#define CLOSE_SUBMIT \ + munit_assert_false(f->closed); \ + UvWriterClose(&f->writer, closeCb); \ + munit_assert_false(f->closed) +#define CLOSE_WAIT LOOP_RUN_UNTIL(&f->closed) +#define CLOSE \ + CLOSE_SUBMIT; \ + CLOSE_WAIT + +#define MAKE_BUFS(BUFS, N_BUFS, CONTENT) \ + { \ + int __i; \ + BUFS = munit_malloc(sizeof *BUFS * N_BUFS); \ + for (__i = 0; __i < N_BUFS; __i++) { \ + uv_buf_t *__buf = &BUFS[__i]; \ + __buf->len = f->block_size; \ + __buf->base = aligned_alloc(f->block_size, f->block_size); \ + munit_assert_ptr_not_null(__buf->base); \ + memset(__buf->base, CONTENT + __i, __buf->len); \ + } \ + } + +#define DESTROY_BUFS(BUFS, N_BUFS) \ + { \ + int __i; \ + for (__i = 0; __i < N_BUFS; __i++) { \ + free(BUFS[__i].base); \ + } \ + free(BUFS); \ + } + +#define WRITE_REQ(N_BUFS, CONTENT, OFFSET, RV, STATUS) \ + struct uv_buf_t *_bufs; \ + struct UvWriterReq _req; \ + struct result _result = {STATUS, false}; \ + int _rv; \ + MAKE_BUFS(_bufs, N_BUFS, CONTENT); \ + _req.data = &_result; \ + _rv = UvWriterSubmit(&f->writer, &_req, _bufs, N_BUFS, OFFSET, \ + submitCbAssertResult); \ + munit_assert_int(_rv, ==, RV); + +/* Submit a write request with the given parameters and wait for the operation + * to successfully complete. Deallocate BUFS when done. + * + * N_BUFS is the number of buffers to allocate and write, each of them will have + * f->block_size bytes. + * + * CONTENT must be an unsigned byte value: all bytes of the first buffer will be + * filled with that value, all bytes of the second buffer will be filled will + * that value plus one, etc. + * + * OFFSET is the offset at which to write the buffers. */ +#define WRITE(N_BUFS, CONTENT, OFFSET) \ + do { \ + WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, 0 /* status */); \ + LOOP_RUN_UNTIL(&_result.done); \ + DESTROY_BUFS(_bufs, N_BUFS); \ + } while (0) + +/* Submit a write request with the given parameters and wait for the operation + * to fail with the given code and message. */ +#define WRITE_FAILURE(N_BUFS, CONTENT, OFFSET, STATUS, ERRMSG) \ + do { \ + WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, STATUS); \ + LOOP_RUN_UNTIL(&_result.done); \ + munit_assert_string_equal(f->writer.errmsg, ERRMSG); \ + DESTROY_BUFS(_bufs, N_BUFS); \ + } while (0) + +/* Submit a write request with the given parameters, close the writer right + * after and assert that the request got canceled. */ +#define WRITE_CLOSE(N_BUFS, CONTENT, OFFSET, STATUS) \ + do { \ + WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, STATUS); \ + CLOSE_SUBMIT; \ + munit_assert_false(_result.done); \ + LOOP_RUN_UNTIL(&_result.done); \ + DESTROY_BUFS(_bufs, N_BUFS); \ + CLOSE_WAIT; \ + } while (0) + +/* Assert that the content of the test file has the given number of blocks, each + * filled with progressive numbers. */ +#define ASSERT_CONTENT(N) \ + do { \ + size_t _size = N * f->block_size; \ + void *_buf = munit_malloc(_size); \ + unsigned _i; \ + unsigned _j; \ + \ + DirReadFile(f->dir, "foo", _buf, _size); \ + \ + for (_i = 0; _i < N; _i++) { \ + char *cursor = (char *)_buf + _i * f->block_size; \ + for (_j = 0; _j < f->block_size; _j++) { \ + munit_assert_int(cursor[_j], ==, _i + 1); \ + } \ + } \ + \ + free(_buf); \ + } while (0) + +#define N_BLOCKS 5 + +/****************************************************************************** + * + * Set up and tear down. 
+ *
+ *****************************************************************************/
+
+static void *setUpDeps(const MunitParameter params[], void *user_data)
+{
+        struct fixture *f = munit_malloc(sizeof *f);
+        char path[UV__PATH_SZ];
+        char errmsg[256];
+        int rv;
+        SET_UP_DIR;
+        SETUP_LOOP;
+        rv = UvFsProbeCapabilities(f->dir, &f->direct_io, &f->async_io,
+                                   &f->fallocate, errmsg);
+        munit_assert_int(rv, ==, 0);
+        f->block_size = f->direct_io != 0 ? f->direct_io : 4096;
+        rv = UvOsJoin(f->dir, "foo", path);
+        munit_assert_int(rv, ==, 0);
+        rv = UvOsOpen(path, O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR, &f->fd);
+        munit_assert_int(rv, ==, 0);
+        rv = UvOsFallocate(f->fd, 0, f->block_size * N_BLOCKS);
+        munit_assert_int(rv, ==, 0);
+        return f;
+}
+
+static void tearDownDeps(void *data)
+{
+        struct fixture *f = data;
+        if (f == NULL) {
+                return; /* Was skipped. */
+        }
+        UvOsClose(f->fd);
+        TEAR_DOWN_LOOP;
+        TEAR_DOWN_DIR;
+        free(f);
+}
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+        struct fixture *f = setUpDeps(params, user_data);
+        if (f == NULL) {
+                return NULL;
+        }
+        INIT(1);
+        return f;
+}
+
+static void tearDown(void *data)
+{
+        struct fixture *f = data;
+        if (f == NULL) {
+                return; /* Was skipped. */
+        }
+        CLOSE;
+        tearDownDeps(f);
+}
+
+/******************************************************************************
+ *
+ * UvWriterInit
+ *
+ *****************************************************************************/
+
+SUITE(UvWriterInit)
+
+/* The kernel has run out of available AIO events. */
+TEST(UvWriterInit, noResources, setUpDeps, tearDownDeps, 0, NULL)
+{
+        struct fixture *f = data;
+        aio_context_t ctx = 0;
+        int rv;
+        rv = AioFill(&ctx, 0);
+        if (rv != 0) {
+                return MUNIT_SKIP;
+        }
+        INIT_ERROR(RAFT_TOOMANY, "AIO events user limit exceeded");
+        AioDestroy(ctx);
+        return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvWriterSubmit
+ *
+ *****************************************************************************/
+
+SUITE(UvWriterSubmit)
+
+/* Write a single buffer. */
+TEST(UvWriterSubmit, one, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
+        ASSERT_CONTENT(1);
+        return MUNIT_OK;
+}
+
+/* Write two buffers, one after the other. */
+TEST(UvWriterSubmit, two, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
+        WRITE(1 /* n bufs */, 2 /* content */, f->block_size /* offset */);
+        ASSERT_CONTENT(2);
+        return MUNIT_OK;
+}
+
+/* Write the same block twice. */
+TEST(UvWriterSubmit, twice, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(1 /* n bufs */, 0 /* content */, 0 /* offset */);
+        WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
+        ASSERT_CONTENT(1);
+        return MUNIT_OK;
+}
+
+/* Write a vector of buffers. */
+TEST(UvWriterSubmit, vec, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
+        ASSERT_CONTENT(1);
+        return MUNIT_OK;
+}
+
+/* Write a vector of buffers twice. */
+TEST(UvWriterSubmit, vecTwice, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
+        WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
+        ASSERT_CONTENT(2);
+        return MUNIT_OK;
+}
+
+/* Write past the allocated space.
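+ * The file is preallocated with only N_BLOCKS blocks, so the last write of
+ * the loop below lands beyond the end of the fallocate'd region and forces
+ * the file to grow.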
*/
+TEST(UvWriterSubmit, beyondEOF, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        int i;
+        SKIP_IF_NO_FIXTURE;
+        for (i = 0; i < N_BLOCKS + 1; i++) {
+                WRITE(1 /* n bufs */, i + 1 /* content */,
+                      i * f->block_size /* offset */);
+        }
+        ASSERT_CONTENT(N_BLOCKS + 1);
+        return MUNIT_OK;
+}
+
+/* Write two different blocks concurrently. */
+TEST(UvWriterSubmit, concurrent, NULL, NULL, 0, DirAllParams)
+{
+        return MUNIT_SKIP; /* TODO: tests stop responding */
+}
+
+/* Write the same block concurrently. */
+TEST(UvWriterSubmit, concurrentSame, NULL, NULL, 0, DirAllParams)
+{
+        return MUNIT_SKIP; /* TODO: tests stop responding */
+}
+
+/* There are not enough resources to create an AIO context to perform the
+ * write. */
+TEST(UvWriterSubmit, noResources, setUpDeps, tearDown, 0, DirNoAioParams)
+{
+        struct fixture *f = data;
+        aio_context_t ctx = 0;
+        int rv;
+        SKIP_IF_NO_FIXTURE;
+        INIT(2);
+        rv = AioFill(&ctx, 0);
+        if (rv != 0) {
+                return MUNIT_SKIP;
+        }
+        WRITE_FAILURE(1, 0, 0, RAFT_TOOMANY, "AIO events user limit exceeded");
+        AioDestroy(ctx);
+        return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvWriterClose
+ *
+ *****************************************************************************/
+
+SUITE(UvWriterClose)
+
+/* Close with an in-flight write running in the threadpool. */
+TEST(UvWriterClose, threadpool, setUp, tearDownDeps, 0, DirNoAioParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE_CLOSE(1, 0, 0, 0);
+        return MUNIT_OK;
+}
+
+/* Close with an in-flight AIO write. */
+TEST(UvWriterClose, aio, setUp, tearDownDeps, 0, DirAioParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE_CLOSE(1, 0, 0, RAFT_CANCELED);
+        return MUNIT_OK;
+}
diff --git a/test/unit/ext/test_uv.c b/test/unit/ext/test_uv.c
index 7bb905d44..aad32fb47 100644
--- a/test/unit/ext/test_uv.c
+++ b/test/unit/ext/test_uv.c
@@ -1,9 +1,9 @@
-#include <raft.h>
 #include 
 #include 
 
 #include "../../../src/lib/transport.h"
+#include "../../../src/raft.h"
 
 #include "../../lib/endpoint.h"
 #include "../../lib/runner.h"
 #include "../../lib/uv.h"
diff --git a/test/unit/test_conn.c b/test/unit/test_conn.c
index 93e4a475c..769c52945 100644
--- a/test/unit/test_conn.c
+++ b/test/unit/test_conn.c
@@ -1,8 +1,3 @@
-#include <raft.h>
-#include <raft/uv.h>
-
-#include "../../include/dqlite.h"
-
 #include "../lib/client.h"
 #include "../lib/config.h"
 #include "../lib/heap.h"
@@ -17,6 +12,7 @@
 #include "../../src/conn.h"
 #include "../../src/gateway.h"
 #include "../../src/lib/transport.h"
+#include "../../src/raft.h"
 #include "../../src/transport.h"
 
 TEST_MODULE(conn);
diff --git a/test/unit/test_vfs.c b/test/unit/test_vfs.c
index 92e5f3139..cfd103d6d 100644
--- a/test/unit/test_vfs.c
+++ b/test/unit/test_vfs.c
@@ -1,7 +1,6 @@
 #include 
 #include 
-#include <raft.h>
 #include 
 
 #include "../../include/dqlite.h"
@@ -13,6 +12,7 @@
 #include "../lib/sqlite.h"
 
 #include "../../src/format.h"
+#include "../../src/raft.h"
 #include "../../src/vfs.h"
 
 static char *bools[] = {"0", "1", NULL};