diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..1a85a480c --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,12 @@ +Checks: '-*,readability-identifier-naming' +HeaderFilterRegex: '.*' +WarningsAsErrors: '*' +CheckOptions: + - key: readability-identifier-naming.StructCase + value: lower_case + - key: readability-identifier-naming.UnionCase + value: lower_case + - key: readability-identifier-naming.FunctionCase + value: lower_case + - key: readability-identifier-naming.TypedefCase + value: lower_case diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 40957eef4..c1c1f34ff 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -22,43 +22,33 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Setup dependencies + + - name: Set up dependencies run: | sudo apt update sudo apt install -y lcov libsqlite3-dev liblz4-dev libuv1-dev - - name: Build raft - env: - CC: ${{ matrix.compiler }} - run: | - git clone https://github.com/canonical/raft.git --depth 1 - cd raft - autoreconf -i - ./configure --enable-debug --enable-sanitize - make -j4 - sudo make install - sudo ldconfig - cd .. - - name: Build dqlite env: CC: ${{ matrix.compiler }} run: | autoreconf -i - ./configure --enable-debug --enable-code-coverage --enable-sanitize - make CFLAGS=-O0 -j2 + ./configure --enable-debug --enable-code-coverage --enable-sanitize --enable-build-raft + make -j4 - name: Test env: CC: ${{ matrix.compiler }} run: | export ${{ matrix.tracing }} - make CFLAGS=-O0 -j2 check || (cat ./test-suite.log && false) + make -j4 check || (cat ./test-suite.log && false) - name: Coverage env: CC: ${{ matrix.compiler }} - run: if [ "${CC}" = "gcc" ]; then make code-coverage-capture; fi + if: ${{ matrix.compiler == 'gcc' }} + run: | + make code-coverage-capture - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index a5cf901a8..69e1d4fdf 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -28,31 +28,17 @@ jobs: sudo make -j$(nproc) install sudo ldconfig - - name: Check out raft - uses: actions/checkout@v4 - with: - repository: canonical/raft - path: raft - - - name: Install raft - run: | - cd raft - autoreconf -i - ./configure --enable-debug --enable-uv --enable-sanitize --enable-backtrace - sudo make -j$(nproc) install - sudo ldconfig - - name: Check out dqlite uses: actions/checkout@v4 with: - ref: refs/pull/${{ github.event.issue.number }}/head + ref: refs/pull/${{ github.event.issue.number }}/merge path: dqlite - name: Install dqlite run: | cd dqlite autoreconf -i - ./configure --enable-debug --enable-sanitize --enable-backtrace + ./configure --enable-debug --enable-sanitize --enable-backtrace --enable-build-raft sudo make -j$(nproc) sudo make install sudo ldconfig diff --git a/.github/workflows/external-raft.yml b/.github/workflows/external-raft.yml new file mode 100644 index 000000000..da23836f8 --- /dev/null +++ b/.github/workflows/external-raft.yml @@ -0,0 +1,38 @@ +name: CI Tests (external libraft) + +on: + - push + - pull_request + +jobs: + build-and-test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Setup dependencies + run: | + sudo apt update + sudo apt install -y libsqlite3-dev liblz4-dev libuv1-dev + + - name: Build raft + run: | + git clone https://github.com/canonical/raft --depth 1 + cd raft + autoreconf -i + ./configure --enable-debug 
--enable-sanitize + make -j4 + sudo make install + sudo ldconfig + + - name: Build dqlite + run: | + autoreconf -i + ./configure --enable-debug --enable-sanitize + make -j4 + + - name: Test + run: | + export LIBRAFT_TRACE=1 LIBDQLITE_TRACE=1 + make -j4 check || (cat ./test-suite.log && false) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 3d9639c11..7f016d2f2 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -10,10 +10,20 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: DoozyX/clang-format-lint-action@v0.16 with: - source: 'src test include' - exclude: 'test/lib/munit.*' - extensions: 'c,h' - clangFormatVersion: 14 - style: file + fetch-depth: 2 + - name: Install apt dependencies + run: | + sudo apt update + sudo apt install -y libsqlite3-dev liblz4-dev libuv1-dev bear + - uses: KyleMayes/install-llvm-action@master + with: + version: 17 + - name: Run clang-format + run: | + find . \( -name '*.c' -or -name '*.h' \) -not -name 'munit.*' -path ./llvm -prune | xargs ./llvm/bin/clang-format --style=file --dry-run -Werror + - name: Run clang-tidy + run: | + shopt -s globstar + bear -- cc -D_GNU_SOURCE -DHAVE_LINUX_AIO_ABI_H -c {src,test}/**/*.c + git show -U0 --first-parent | ./clang-tidy-diff.py -p1 -config-file=.clang-tidy -clang-tidy-binary=./llvm/bin/clang-tidy -use-color diff --git a/Makefile.am b/Makefile.am index 8c8cb2581..ee5751e0d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,18 +1,33 @@ ACLOCAL_AMFLAGS = -I m4 AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) -AM_CFLAGS += $(SQLITE_CFLAGS) $(UV_CFLAGS) $(RAFT_CFLAGS) $(PTHREAD_CFLAGS) -AM_LDFLAGS = $(UV_LIBS) $(RAFT_LIBS) $(PTHREAD_LIBS) +AM_CFLAGS += $(SQLITE_CFLAGS) $(UV_CFLAGS) $(PTHREAD_CFLAGS) +AM_LDFLAGS = $(UV_LIBS) $(PTHREAD_LIBS) if !BUILD_SQLITE_ENABLED AM_LDFLAGS += $(SQLITE_LIBS) endif +if !BUILD_RAFT_ENABLED +AM_CFLAGS += $(RAFT_CFLAGS) +AM_LDFLAGS += $(RAFT_LIBS) +endif + +if DEBUG_ENABLED + AM_CFLAGS += -g3 +else + AM_CFLAGS += -O2 +endif +if SANITIZE_ENABLED + AM_CFLAGS += -fsanitize=address +endif +if BACKTRACE_ENABLED + AM_CFLAGS += -DDQLITE_ASSERT_WITH_BACKTRACE -DRAFT_ASSERT_WITH_BACKTRACE + AM_LDFLAGS += -lbacktrace +endif + include_HEADERS = include/dqlite.h -lib_LTLIBRARIES = libdqlite.la -libdqlite_la_CFLAGS = $(AM_CFLAGS) -fvisibility=hidden -libdqlite_la_LDFLAGS = $(AM_LDFLAGS) -version-info 0:1:0 -libdqlite_la_SOURCES = \ +basic_dqlite_sources = \ src/bind.c \ src/client/protocol.c \ src/command.c \ @@ -46,14 +61,72 @@ libdqlite_la_SOURCES = \ src/tuple.c \ src/vfs.c +lib_LTLIBRARIES = libdqlite.la +libdqlite_la_CFLAGS = $(AM_CFLAGS) -fvisibility=hidden -DRAFT_API='' +libdqlite_la_LDFLAGS = $(AM_LDFLAGS) -version-info 0:1:0 +libdqlite_la_SOURCES = $(basic_dqlite_sources) + if BUILD_SQLITE_ENABLED libdqlite_la_SOURCES += sqlite3.c endif -check_PROGRAMS = \ - unit-test \ - integration-test -TESTS = unit-test integration-test +if BUILD_RAFT_ENABLED +libraft_la_SOURCES = \ + src/raft/byte.c \ + src/raft/callbacks.c \ + src/raft/client.c \ + src/raft/compress.c \ + src/raft/configuration.c \ + src/raft/convert.c \ + src/raft/election.c \ + src/raft/entry.c \ + src/raft/err.c \ + src/raft/fixture.c \ + src/raft/flags.c \ + src/raft/heap.c \ + src/raft/lifecycle.c \ + src/raft/log.c \ + src/raft/membership.c \ + src/raft/progress.c \ + src/raft/raft.c \ + src/raft/recv.c \ + src/raft/recv_append_entries.c \ + src/raft/recv_append_entries_result.c \ + src/raft/recv_request_vote.c \ + src/raft/recv_request_vote_result.c \ + src/raft/recv_install_snapshot.c 
\ + src/raft/recv_timeout_now.c \ + src/raft/replication.c \ + src/raft/snapshot.c \ + src/raft/start.c \ + src/raft/state.c \ + src/raft/syscall.c \ + src/raft/tick.c \ + src/raft/uv.c \ + src/raft/uv_append.c \ + src/raft/uv_encoding.c \ + src/raft/uv_finalize.c \ + src/raft/uv_fs.c \ + src/raft/uv_ip.c \ + src/raft/uv_list.c \ + src/raft/uv_metadata.c \ + src/raft/uv_os.c \ + src/raft/uv_prepare.c \ + src/raft/uv_recv.c \ + src/raft/uv_segment.c \ + src/raft/uv_send.c \ + src/raft/uv_snapshot.c \ + src/raft/uv_tcp.c \ + src/raft/uv_tcp_listen.c \ + src/raft/uv_tcp_connect.c \ + src/raft/uv_truncate.c \ + src/raft/uv_work.c \ + src/raft/uv_writer.c + +libdqlite_la_SOURCES += $(libraft_la_SOURCES) +endif # BUILD_RAFT_ENABLED + +check_PROGRAMS = unit-test integration-test check_LTLIBRARIES = libtest.la @@ -70,7 +143,7 @@ libtest_la_SOURCES = \ test/lib/sqlite.c \ test/lib/uv.c -unit_test_SOURCES = $(libdqlite_la_SOURCES) +unit_test_SOURCES = $(basic_dqlite_sources) unit_test_SOURCES += \ test/test_error.c \ test/test_integration.c \ @@ -96,6 +169,10 @@ unit_test_CFLAGS = $(AM_CFLAGS) -Wno-unknown-warning-option -Wno-uninitialized - unit_test_LDFLAGS = $(AM_LDFLAGS) unit_test_LDADD = libtest.la +if BUILD_RAFT_ENABLED +unit_test_LDADD += libraft.la +endif + integration_test_SOURCES = \ test/integration/test_client.c \ test/integration/test_cluster.c \ @@ -110,19 +187,148 @@ integration_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion integration_test_LDFLAGS = $(AM_LDFLAGS) -no-install integration_test_LDADD = libtest.la libdqlite.la -if DEBUG_ENABLED - AM_CFLAGS += -g3 -else - AM_CFLAGS += -O2 -endif -if SANITIZE_ENABLED - AM_CFLAGS += -fsanitize=address +if BUILD_RAFT_ENABLED +check_LTLIBRARIES += libraft.la + +check_PROGRAMS += \ + raft-core-unit-test \ + raft-core-integration-test \ + raft-uv-unit-test \ + raft-uv-integration-test \ + raft-core-fuzzy-test + +libtest_la_SOURCES += \ + test/raft/lib/addrinfo.c \ + test/raft/lib/fault.c \ + test/raft/lib/fsm.c \ + test/raft/lib/heap.c \ + test/raft/lib/munit.c \ + test/raft/lib/tcp.c \ + test/raft/lib/cluster.c \ + test/raft/lib/aio.c \ + test/raft/lib/dir.c \ + test/raft/lib/tcp.c \ + test/raft/lib/loop.c + +libraft_la_CFLAGS = $(AM_CFLAGS) +libraft_la_LDFLAGS = $(UV_LIBS) + +raft_core_unit_test_SOURCES = \ + src/tracing.c \ + src/raft/byte.c \ + src/raft/compress.c \ + src/raft/configuration.c \ + src/raft/err.c \ + src/raft/flags.c \ + src/raft/heap.c \ + src/raft/log.c \ + test/raft/unit/main_core.c \ + test/raft/unit/test_byte.c \ + test/raft/unit/test_compress.c \ + test/raft/unit/test_configuration.c \ + test/raft/unit/test_err.c \ + test/raft/unit/test_flags.c \ + test/raft/unit/test_log.c \ + test/raft/unit/test_queue.c +raft_core_unit_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion +raft_core_unit_test_LDADD = libtest.la + +raft_core_integration_test_SOURCES = \ + src/tracing.c \ + test/raft/integration/main_core.c \ + test/raft/integration/test_apply.c \ + test/raft/integration/test_assign.c \ + test/raft/integration/test_barrier.c \ + test/raft/integration/test_bootstrap.c \ + test/raft/integration/test_digest.c \ + test/raft/integration/test_election.c \ + test/raft/integration/test_fixture.c \ + test/raft/integration/test_heap.c \ + test/raft/integration/test_init.c \ + test/raft/integration/test_membership.c \ + test/raft/integration/test_recover.c \ + test/raft/integration/test_replication.c \ + test/raft/integration/test_snapshot.c \ + test/raft/integration/test_start.c \ + test/raft/integration/test_strerror.c \ + 
test/raft/integration/test_tick.c \ + test/raft/integration/test_transfer.c \ + test/raft/integration/test_voter_contacts.c +raft_core_integration_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion +raft_core_integration_test_LDFLAGS = -no-install +raft_core_integration_test_LDADD = libtest.la libraft.la + +raft_core_fuzzy_test_SOURCES = \ + src/tracing.c \ + test/raft/fuzzy/main_core.c \ + test/raft/fuzzy/test_election.c \ + test/raft/fuzzy/test_liveness.c \ + test/raft/fuzzy/test_membership.c \ + test/raft/fuzzy/test_replication.c +raft_core_fuzzy_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion +raft_core_fuzzy_test_LDFLAGS = -no-install +raft_core_fuzzy_test_LDADD = libtest.la libraft.la + +raft_uv_unit_test_SOURCES = \ + src/raft/err.c \ + src/raft/heap.c \ + src/raft/syscall.c \ + src/raft/uv_fs.c \ + src/raft/uv_os.c \ + src/raft/uv_writer.c \ + test/raft/unit/main_uv.c \ + test/raft/unit/test_uv_fs.c \ + test/raft/unit/test_uv_os.c \ + test/raft/unit/test_uv_writer.c +raft_uv_unit_test_CFLAGS = $(AM_CFLAGS) -Wno-conversion +raft_uv_unit_test_LDADD = libtest.la $(UV_LIBS) + +# The integration/uv test is not linked to libraft, but built +# directly against the libraft sources in order to test some +# non-visible, non-API functions. +raft_uv_integration_test_SOURCES = \ + $(libraft_la_SOURCES) \ + src/tracing.c \ + test/raft/integration/main_uv.c \ + test/raft/integration/test_uv_init.c \ + test/raft/integration/test_uv_append.c \ + test/raft/integration/test_uv_bootstrap.c \ + test/raft/integration/test_uv_load.c \ + test/raft/integration/test_uv_recover.c \ + test/raft/integration/test_uv_recv.c \ + test/raft/integration/test_uv_send.c \ + test/raft/integration/test_uv_set_term.c \ + test/raft/integration/test_uv_tcp_connect.c \ + test/raft/integration/test_uv_tcp_listen.c \ + test/raft/integration/test_uv_snapshot_put.c \ + test/raft/integration/test_uv_truncate.c \ + test/raft/integration/test_uv_truncate_snapshot.c \ + test/raft/integration/test_uv_work.c +raft_uv_integration_test_CFLAGS = $(AM_CFLAGS) -Wno-type-limits -Wno-conversion +raft_uv_integration_test_LDFLAGS = -no-install +raft_uv_integration_test_LDADD = libtest.la $(UV_LIBS) + +if LZ4_AVAILABLE +libdqlite_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS) +libdqlite_la_LDFLAGS += $(LZ4_LIBS) +raft_core_unit_test_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS) +raft_core_unit_test_LDFLAGS = $(LZ4_LIBS) +libraft_la_CFLAGS += -DLZ4_AVAILABLE $(LZ4_CFLAGS) +libraft_la_LDFLAGS += $(LZ4_LIBS) +raft_uv_integration_test_CFLAGS += -DLZ4_AVAILABLE +raft_uv_integration_test_LDFLAGS += $(LZ4_LIBS) endif -if BACKTRACE_ENABLED - AM_CFLAGS += -DDQLITE_ASSERT_WITH_BACKTRACE - AM_LDFLAGS += -lbacktrace +if LZ4_ENABLED +libdqlite_la_CFLAGS += -DLZ4_ENABLED +raft_uv_integration_test_CFLAGS += -DLZ4_ENABLED +raft_core_unit_test_CFLAGS += -DLZ4_ENABLED +libraft_la_CFLAGS += -DLZ4_ENABLED endif +endif # BUILD_RAFT_ENABLED + +TESTS = $(check_PROGRAMS) + if CODE_COVERAGE_ENABLED include $(top_srcdir)/aminclude_static.am diff --git a/README.md b/README.md index 52e68abcb..5a26f5185 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,13 @@ dqlite [![CI Tests](https://github.com/canonical/dqlite/actions/workflows/build- [English](./README.md)|[简体中文](./README_CH.md) -[dqlite](https://dqlite.io) is a C library that implements an embeddable and replicated SQL database -engine with high availability and automatic failover. +[dqlite](https://dqlite.io) is a C library that implements an embeddable and +replicated SQL database engine with high availability and automatic failover. 
-The acronym "dqlite" stands for "distributed SQLite", meaning that dqlite extends -[SQLite](https://sqlite.org/) with a network protocol that can connect together -various instances of your application and have them act as a highly-available -cluster, with no dependency on external databases. +The acronym "dqlite" stands for "distributed SQLite", meaning that dqlite +extends [SQLite](https://sqlite.org/) with a network protocol that can connect +together various instances of your application and have them act as a +highly-available cluster, with no dependency on external databases. Design highlights ---------------- @@ -17,24 +17,23 @@ Design highlights * Asynchronous single-threaded implementation using [libuv](https://libuv.org/) as event loop. * Custom wire protocol optimized for SQLite primitives and data types. -* Data replication based on the [Raft](https://raft.github.io/) algorithm and its - efficient [C-raft](https://github.com/canonical/raft) implementation. +* Data replication based on the [Raft](https://raft.github.io/) algorithm. License ------- -The dqlite library is released under a slightly modified version of LGPLv3, that -includes a copyright exception allowing users to statically link the library code -in their project and release the final work under their own terms. See the full -[license](https://github.com/canonical/dqlite/blob/master/LICENSE) text. +The dqlite library is released under a slightly modified version of LGPLv3, +that includes a copyright exception allowing users to statically link the +library code in their project and release the final work under their own terms. +See the full [license](https://github.com/canonical/dqlite/blob/master/LICENSE) +text. Compatibility ------------- dqlite runs on Linux and requires a kernel with support for [native async I/O](https://man7.org/linux/man-pages/man2/io_setup.2.html) (not to be confused -with [POSIX AIO](https://man7.org/linux/man-pages/man7/aio.7.html)), which is -used by the libuv backend of C-raft. +with [POSIX AIO](https://man7.org/linux/man-pages/man7/aio.7.html)). Try it ------- @@ -49,24 +48,26 @@ Media A talk about dqlite was given at FOSDEM 2020, you can watch it [here](https://fosdem.org/2020/schedule/event/dqlite/). -[Here](https://gcore.com/blog/comparing-litestream-rqlite-dqlite/) is a blog post from 2022 comparing dqlite with rqlite and Litestream, other replication software for SQLite. +[Here](https://gcore.com/blog/comparing-litestream-rqlite-dqlite/) is a blog +post from 2022 comparing dqlite with rqlite and Litestream, other replication +software for SQLite. Wire protocol ------------- -If you wish to write a client, please refer to the [wire protocol](https://dqlite.io/docs/protocol) -documentation. +If you wish to write a client, please refer to the [wire +protocol](https://dqlite.io/docs/protocol) documentation. Install ------- -If you are on a Debian-based system, you can get the latest development release from -dqlite's [dev PPA](https://launchpad.net/~dqlite/+archive/ubuntu/dev): +If you are on a Debian-based system, you can get the latest development release +from dqlite's [dev PPA](https://launchpad.net/~dqlite/+archive/ubuntu/dev): ``` sudo add-apt-repository ppa:dqlite/dev -sudo apt-get update -sudo apt-get install libdqlite-dev +sudo apt update +sudo apt install libdqlite-dev ``` Build @@ -74,45 +75,50 @@ Build To build libdqlite from source you'll need: -* A reasonably recent version of [libuv](http://libuv.org/) (v1.8.0 or beyond). 
-* A reasonably recent version of sqlite3-dev
-* A build of the [C-raft](https://github.com/canonical/raft) Raft library.
+* Build dependencies: pkg-config and GNU Autoconf, Automake, libtool, and make
+* A reasonably recent version of [libuv](https://libuv.org/) (v1.8.0 or later), with headers.
+* A reasonably recent version of [SQLite](https://sqlite.org/) (v3.22.0 or later), with headers.
+* Optionally, a reasonably recent version of [LZ4](https://lz4.org/) (v1.7.1 or later), with headers.
 
-Your distribution should already provide you with a pre-built libuv shared
-library and libsqlite3-dev.
+Your distribution should already provide you with these dependencies. For
+example, on Debian-based distros:
 
-For the Debian-based Linux distros you can install the build dependencies with:
+```
+sudo apt install pkg-config autoconf automake libtool make libuv1-dev libsqlite3-dev liblz4-dev
+```
+
+With these dependencies installed, you can build and install the dqlite shared
+library and headers as follows:
 
 ```
-sudo apt install autoconf libuv1-dev liblz4-dev libtool pkg-config build-essential libsqlite3-dev
+$ autoreconf -i
+$ ./configure --enable-build-raft
+$ make
+$ sudo make install
 ```
 
-To build the raft library:
+The default installation prefix is `/usr/local`; you may need to run
 
 ```
-git clone https://github.com/canonical/raft.git
-cd raft
-autoreconf -i
-./configure
-make
-sudo make install
-cd ..
+$ sudo ldconfig
 ```
 
-Once all the required libraries are installed, in order to build the dqlite
-shared library itself, you can run:
+to enable the linker to find `libdqlite.so`. To install to a different prefix,
+replace the configure step with something like
 
 ```
-autoreconf -i
-./configure
-make
-sudo make install
+$ ./configure --enable-build-raft --prefix=/usr
 ```
 
+The `--enable-build-raft` option causes dqlite to use its bundled Raft
+implementation instead of linking to an external libraft; the latter is a
+legacy configuration that should not be used for new development.
+
 Usage Notes
 -----------
 
-Detailed tracing will be enabled when the environment variable `LIBDQLITE_TRACE` is set before startup.
-The value of it can be in `[0..5]` range and reperesents a tracing level, where
-`0` means "no traces" emitted, `5` enables minimum (FATAL records only), and `1`
-enables maximum verbosity (all: DEBUG, INFO, WARN, ERROR, FATAL records).
+Detailed tracing will be enabled when the environment variable
+`LIBDQLITE_TRACE` is set before startup. Its value can be in the `[0..5]`
+range and represents a tracing level, where `0` means no traces are emitted,
+`5` enables minimum verbosity (FATAL records only), and `1` enables maximum
+verbosity (all records: DEBUG, INFO, WARN, ERROR, FATAL).
diff --git a/clang-tidy-diff.py b/clang-tidy-diff.py
new file mode 100755
index 000000000..d96b3450f
--- /dev/null
+++ b/clang-tidy-diff.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+#
+# ===- clang-tidy-diff.py - ClangTidy Diff Checker -----------*- python -*--===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===-----------------------------------------------------------------------===#
+
+r"""
+ClangTidy Diff Checker
+======================
+
+This script reads input from a unified diff, runs clang-tidy on all changed
+files and outputs clang-tidy warnings in changed lines only.
This is useful to +detect clang-tidy regressions in the lines touched by a specific patch. +Example usage for git/svn users: + + git diff -U0 HEAD^ | clang-tidy-diff.py -p1 + svn diff --diff-cmd=diff -x-U0 | \ + clang-tidy-diff.py -fix -checks=-*,modernize-use-override + +""" + +import argparse +import glob +import json +import multiprocessing +import os +import re +import shutil +import subprocess +import sys +import tempfile +import threading +import traceback + +try: + import yaml +except ImportError: + yaml = None + +is_py2 = sys.version[0] == "2" + +if is_py2: + import Queue as queue +else: + import queue as queue + + +def run_tidy(task_queue, lock, timeout, failed_files): + watchdog = None + while True: + command = task_queue.get() + try: + proc = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + if timeout is not None: + watchdog = threading.Timer(timeout, proc.kill) + watchdog.start() + + stdout, stderr = proc.communicate() + if proc.returncode != 0: + if proc.returncode < 0: + msg = "Terminated by signal %d : %s\n" % ( + -proc.returncode, + " ".join(command), + ) + stderr += msg.encode("utf-8") + failed_files.append(command) + + with lock: + sys.stdout.write(stdout.decode("utf-8") + "\n") + sys.stdout.flush() + if stderr: + sys.stderr.write(stderr.decode("utf-8") + "\n") + sys.stderr.flush() + except Exception as e: + with lock: + sys.stderr.write("Failed: " + str(e) + ": ".join(command) + "\n") + finally: + with lock: + if not (timeout is None or watchdog is None): + if not watchdog.is_alive(): + sys.stderr.write( + "Terminated by timeout: " + " ".join(command) + "\n" + ) + watchdog.cancel() + task_queue.task_done() + + +def start_workers(max_tasks, tidy_caller, arguments): + for _ in range(max_tasks): + t = threading.Thread(target=tidy_caller, args=arguments) + t.daemon = True + t.start() + + +def merge_replacement_files(tmpdir, mergefile): + """Merge all replacement files in a directory into a single file""" + # The fixes suggested by clang-tidy >= 4.0.0 are given under + # the top level key 'Diagnostics' in the output yaml files + mergekey = "Diagnostics" + merged = [] + for replacefile in glob.iglob(os.path.join(tmpdir, "*.yaml")): + content = yaml.safe_load(open(replacefile, "r")) + if not content: + continue # Skip empty files. + merged.extend(content.get(mergekey, [])) + + if merged: + # MainSourceFile: The key is required by the definition inside + # include/clang/Tooling/ReplacementsYaml.h, but the value + # is actually never used inside clang-apply-replacements, + # so we set it to '' here. + output = {"MainSourceFile": "", mergekey: merged} + with open(mergefile, "w") as out: + yaml.safe_dump(output, out) + else: + # Empty the file: + open(mergefile, "w").close() + + +def main(): + parser = argparse.ArgumentParser( + description="Run clang-tidy against changed files, and " + "output diagnostics only for modified " + "lines." 
+ ) + parser.add_argument( + "-clang-tidy-binary", + metavar="PATH", + default="clang-tidy", + help="path to clang-tidy binary", + ) + parser.add_argument( + "-p", + metavar="NUM", + default=0, + help="strip the smallest prefix containing P slashes", + ) + parser.add_argument( + "-regex", + metavar="PATTERN", + default=None, + help="custom pattern selecting file paths to check " + "(case sensitive, overrides -iregex)", + ) + parser.add_argument( + "-iregex", + metavar="PATTERN", + default=r".*\.(cpp|cc|c\+\+|cxx|c|cl|h|hpp|m|mm|inc)", + help="custom pattern selecting file paths to check " + "(case insensitive, overridden by -regex)", + ) + parser.add_argument( + "-j", + type=int, + default=1, + help="number of tidy instances to be run in parallel.", + ) + parser.add_argument( + "-timeout", type=int, default=None, help="timeout per each file in seconds." + ) + parser.add_argument( + "-fix", action="store_true", default=False, help="apply suggested fixes" + ) + parser.add_argument( + "-checks", + help="checks filter, when not specified, use clang-tidy " "default", + default="", + ) + parser.add_argument( + "-config-file", + dest="config_file", + help="Specify the path of .clang-tidy or custom config file", + default="", + ) + parser.add_argument("-use-color", action="store_true", help="Use colors in output") + parser.add_argument( + "-path", dest="build_path", help="Path used to read a compile command database." + ) + if yaml: + parser.add_argument( + "-export-fixes", + metavar="FILE_OR_DIRECTORY", + dest="export_fixes", + help="A directory or a yaml file to store suggested fixes in, " + "which can be applied with clang-apply-replacements. If the " + "parameter is a directory, the fixes of each compilation unit are " + "stored in individual yaml files in the directory.", + ) + else: + parser.add_argument( + "-export-fixes", + metavar="DIRECTORY", + dest="export_fixes", + help="A directory to store suggested fixes in, which can be applied " + "with clang-apply-replacements. The fixes of each compilation unit are " + "stored in individual yaml files in the directory.", + ) + parser.add_argument( + "-extra-arg", + dest="extra_arg", + action="append", + default=[], + help="Additional argument to append to the compiler " "command line.", + ) + parser.add_argument( + "-extra-arg-before", + dest="extra_arg_before", + action="append", + default=[], + help="Additional argument to prepend to the compiler " "command line.", + ) + parser.add_argument( + "-quiet", + action="store_true", + default=False, + help="Run clang-tidy in quiet mode", + ) + parser.add_argument( + "-load", + dest="plugins", + action="append", + default=[], + help="Load the specified plugin in clang-tidy.", + ) + + clang_tidy_args = [] + argv = sys.argv[1:] + if "--" in argv: + clang_tidy_args.extend(argv[argv.index("--") :]) + argv = argv[: argv.index("--")] + + args = parser.parse_args(argv) + + # Extract changed lines for each file. 
+ filename = None + lines_by_file = {} + for line in sys.stdin: + match = re.search('^\+\+\+\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line) + if match: + filename = match.group(2) + if filename is None: + continue + + if args.regex is not None: + if not re.match("^%s$" % args.regex, filename): + continue + else: + if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE): + continue + + match = re.search("^@@.*\+(\d+)(,(\d+))?", line) + if match: + start_line = int(match.group(1)) + line_count = 1 + if match.group(3): + line_count = int(match.group(3)) + if line_count == 0: + continue + end_line = start_line + line_count - 1 + lines_by_file.setdefault(filename, []).append([start_line, end_line]) + + if not any(lines_by_file): + print("No relevant changes found.") + sys.exit(0) + + max_task_count = args.j + if max_task_count == 0: + max_task_count = multiprocessing.cpu_count() + max_task_count = min(len(lines_by_file), max_task_count) + + combine_fixes = False + export_fixes_dir = None + delete_fixes_dir = False + if args.export_fixes is not None: + # if a directory is given, create it if it does not exist + if args.export_fixes.endswith(os.path.sep) and not os.path.isdir( + args.export_fixes + ): + os.makedirs(args.export_fixes) + + if not os.path.isdir(args.export_fixes): + if not yaml: + raise RuntimeError( + "Cannot combine fixes in one yaml file. Either install PyYAML or specify an output directory." + ) + + combine_fixes = True + + if os.path.isdir(args.export_fixes): + export_fixes_dir = args.export_fixes + + if combine_fixes: + export_fixes_dir = tempfile.mkdtemp() + delete_fixes_dir = True + + # Tasks for clang-tidy. + task_queue = queue.Queue(max_task_count) + # A lock for console output. + lock = threading.Lock() + + # List of files with a non-zero return code. + failed_files = [] + + # Run a pool of clang-tidy workers. + start_workers( + max_task_count, run_tidy, (task_queue, lock, args.timeout, failed_files) + ) + + # Form the common args list. + common_clang_tidy_args = [] + if args.fix: + common_clang_tidy_args.append("-fix") + if args.checks != "": + common_clang_tidy_args.append("-checks=" + args.checks) + if args.config_file != "": + common_clang_tidy_args.append("-config-file=" + args.config_file) + if args.quiet: + common_clang_tidy_args.append("-quiet") + if args.build_path is not None: + common_clang_tidy_args.append("-p=%s" % args.build_path) + if args.use_color: + common_clang_tidy_args.append("--use-color") + for arg in args.extra_arg: + common_clang_tidy_args.append("-extra-arg=%s" % arg) + for arg in args.extra_arg_before: + common_clang_tidy_args.append("-extra-arg-before=%s" % arg) + for plugin in args.plugins: + common_clang_tidy_args.append("-load=%s" % plugin) + + for name in lines_by_file: + line_filter_json = json.dumps( + [{"name": name, "lines": lines_by_file[name]}], separators=(",", ":") + ) + + # Run clang-tidy on files containing changes. + command = [args.clang_tidy_binary] + command.append("-line-filter=" + line_filter_json) + if args.export_fixes is not None: + # Get a temporary file. We immediately close the handle so clang-tidy can + # overwrite it. + (handle, tmp_name) = tempfile.mkstemp(suffix=".yaml", dir=export_fixes_dir) + os.close(handle) + command.append("-export-fixes=" + tmp_name) + command.extend(common_clang_tidy_args) + command.append(name) + command.extend(clang_tidy_args) + + task_queue.put(command) + + # Application return code + return_code = 0 + + # Wait for all threads to be done. 
+    task_queue.join()
+    if failed_files:
+        return_code = 1
+
+    if combine_fixes:
+        print("Writing fixes to " + args.export_fixes + " ...")
+        try:
+            merge_replacement_files(export_fixes_dir, args.export_fixes)
+        except:
+            sys.stderr.write("Error exporting fixes.\n")
+            traceback.print_exc()
+            return_code = 1
+
+    if delete_fixes_dir:
+        shutil.rmtree(export_fixes_dir)
+    sys.exit(return_code)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/configure.ac b/configure.ac
index 19cbdf701..99be7875a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -6,7 +6,7 @@ AC_CONFIG_AUX_DIR([ac])
 AM_INIT_AUTOMAKE([subdir-objects -Wall -Werror -Wno-portability foreign])
 AM_SILENT_RULES([yes])
 
-AC_PROG_CC_STDC
+AC_PROG_CC
 AC_USE_SYSTEM_EXTENSIONS
 AX_PTHREAD
 
@@ -30,21 +30,24 @@ AM_COND_IF(SANITIZE_ENABLED,
 AC_ARG_ENABLE(backtrace, AS_HELP_STRING([--enable-backtrace[=ARG]], [print backtrace on assertion failure [default=no]]))
 AM_CONDITIONAL(BACKTRACE_ENABLED, test "x$enable_backtrace" = "xyes")
 
+
 AC_ARG_ENABLE(build-sqlite, AS_HELP_STRING([--enable-build-sqlite[=ARG]], [build libsqlite3 from sqlite3.c in the build root [default=no]]))
 AM_CONDITIONAL(BUILD_SQLITE_ENABLED, test "x$enable_build_sqlite" = "xyes")
 
+AC_ARG_ENABLE(build-raft, AS_HELP_STRING([--enable-build-raft[=ARG]], [use the bundled raft sources instead of linking to libraft [default=no]]))
+AM_CONDITIONAL(BUILD_RAFT_ENABLED, test "x$enable_build_raft" = "xyes")
+
+# Allow not linking to liblz4 even if it's present.
+AC_ARG_WITH([lz4], AS_HELP_STRING([--without-lz4], [never link to liblz4]))
+
 # Whether to enable code coverage.
 AX_CODE_COVERAGE
 
 # Checks for header files.
-AC_CHECK_HEADERS([arpa/inet.h fcntl.h stdint.h stdlib.h string.h sys/socket.h unistd.h])
+AC_CHECK_HEADERS([linux/io_uring.h linux/aio_abi.h])
 
-# Checks for typedefs, structures, and compiler characteristics.
-AC_TYPE_SIZE_T
-AC_TYPE_SSIZE_T
-AC_TYPE_UINT16_T
-AC_TYPE_UINT32_T
-AC_TYPE_UINT8_T
+# Checks for library functions and definitions.
+AC_CHECK_DECLS(RWF_NOWAIT, [], [AC_MSG_ERROR(Linux kernel >= 4.14 required.)], [#include <linux/aio_abi.h>])
 
 # Enable large file support.
This is mandatory in order to interoperate with # libuv, which enables large file support by default, making the size of 'off_t' @@ -54,7 +57,17 @@ AC_SYS_LARGEFILE # Checks for libraries PKG_CHECK_MODULES(SQLITE, [sqlite3 >= 3.22.0], [], []) PKG_CHECK_MODULES(UV, [libuv >= 1.8.0], [], []) -PKG_CHECK_MODULES(RAFT, [raft >= 0.18.1], [], []) +AS_IF([test "x$enable_build_raft" != "xyes"], [PKG_CHECK_MODULES(RAFT, [raft >= 0.18.1], [], [])], []) + +AS_IF([test "x$with_lz4" != "xno"], [PKG_CHECK_MODULES(LZ4, [liblz4 >= 1.7.1], [have_lz4=yes], [have_lz4=no])], [have_lz4=no]) +AS_IF([test "x$with_lz4" != "xno" -a "x$have_lz4" = "xno"], [AC_MSG_ERROR([liblz4 required but not found])], []) +AM_CONDITIONAL(LZ4_AVAILABLE, test "x$have_lz4" = "xyes") + +AC_ARG_ENABLE(lz4, AS_HELP_STRING([--disable-lz4], [when building with lz4, do not compress snapshots by default])) +AS_IF([test "x$enable_lz4" != "x" -a "x$have_lz4" = "xno"], + [AC_MSG_ERROR([snapshot compression (either by default or not) requires liblz4])], + []) +AM_CONDITIONAL(LZ4_ENABLED, test "x$enable_lz4" != "xno" -a "x$have_lz4" = "xyes") CC_CHECK_FLAGS_APPEND([AM_CFLAGS],[CFLAGS],[ \ -std=c11 \ diff --git a/include/dqlite.h b/include/dqlite.h index ad57bd208..0b24b399f 100644 --- a/include/dqlite.h +++ b/include/dqlite.h @@ -19,8 +19,9 @@ */ #define DQLITE_EXPERIMENTAL -/* XXX */ +#ifndef DQLITE_VISIBLE_TO_TESTS #define DQLITE_VISIBLE_TO_TESTS DQLITE_API +#endif /** * Version. diff --git a/m4/ax_pthread.m4 b/m4/ax_pthread.m4 index 1598d077f..9f35d1391 100644 --- a/m4/ax_pthread.m4 +++ b/m4/ax_pthread.m4 @@ -14,20 +14,24 @@ # flags that are needed. (The user can also force certain compiler # flags/libs to be tested by setting these environment variables.) # -# Also sets PTHREAD_CC to any special C compiler that is needed for -# multi-threaded programs (defaults to the value of CC otherwise). (This -# is necessary on AIX to use the special cc_r compiler alias.) +# Also sets PTHREAD_CC and PTHREAD_CXX to any special C compiler that is +# needed for multi-threaded programs (defaults to the value of CC +# respectively CXX otherwise). (This is necessary on e.g. AIX to use the +# special cc_r/CC_r compiler alias.) # # NOTE: You are assumed to not only compile your program with these flags, # but also to link with them as well. For example, you might link with # $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS +# $PTHREAD_CXX $CXXFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS # # If you are only building threaded programs, you may wish to use these # variables in your default LIBS, CFLAGS, and CC: # # LIBS="$PTHREAD_LIBS $LIBS" # CFLAGS="$CFLAGS $PTHREAD_CFLAGS" +# CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS" # CC="$PTHREAD_CC" +# CXX="$PTHREAD_CXX" # # In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant # has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to @@ -83,7 +87,7 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. 
-#serial 27
+#serial 31
 
 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD])
 AC_DEFUN([AX_PTHREAD], [
@@ -105,6 +109,7 @@ if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then
         ax_pthread_save_CFLAGS="$CFLAGS"
         ax_pthread_save_LIBS="$LIBS"
         AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"])
+        AS_IF([test "x$PTHREAD_CXX" != "x"], [CXX="$PTHREAD_CXX"])
         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
         LIBS="$PTHREAD_LIBS $LIBS"
         AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS])
@@ -386,7 +391,7 @@ if test "x$ax_pthread_clang" = "xyes"; then
     # step
     ax_pthread_save_ac_link="$ac_link"
     ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g'
-    ax_pthread_link_step=`$as_echo "$ac_link" | sed "$ax_pthread_sed"`
+    ax_pthread_link_step=`AS_ECHO(["$ac_link"]) | sed "$ax_pthread_sed"`
     ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)"
     ax_pthread_save_CFLAGS="$CFLAGS"
     for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do
@@ -482,18 +487,28 @@ if test "x$ax_pthread_ok" = "xyes"; then
             [#handle absolute path differently from PATH based program lookup
              AS_CASE(["x$CC"],
                  [x/*],
-                 [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])],
-                 [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])])
+                 [
+                  AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])
+                  AS_IF([test "x${CXX}" != "x"], [AS_IF([AS_EXECUTABLE_P([${CXX}_r])],[PTHREAD_CXX="${CXX}_r"])])
+                 ],
+                 [
+                  AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])
+                  AS_IF([test "x${CXX}" != "x"], [AC_CHECK_PROGS([PTHREAD_CXX],[${CXX}_r],[$CXX])])
+                 ]
+             )
+            ])
             ;;
     esac
 fi
 fi
 
 test -n "$PTHREAD_CC" || PTHREAD_CC="$CC"
+test -n "$PTHREAD_CXX" || PTHREAD_CXX="$CXX"
 
 AC_SUBST([PTHREAD_LIBS])
 AC_SUBST([PTHREAD_CFLAGS])
 AC_SUBST([PTHREAD_CC])
+AC_SUBST([PTHREAD_CXX])
 
 # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
 if test "x$ax_pthread_ok" = "xyes"; then
diff --git a/src/command.h b/src/command.h
index f41adc6a5..4b911d778 100644
--- a/src/command.h
+++ b/src/command.h
@@ -5,11 +5,10 @@
 #ifndef COMMAND_H_
 #define COMMAND_H_
 
-#include <raft.h>
-
 #include "../include/dqlite.h"
 
 #include "lib/serialize.h"
+#include "raft.h"
 
 /* Command type codes */
 enum { COMMAND_OPEN = 1, COMMAND_FRAMES, COMMAND_UNDO, COMMAND_CHECKPOINT };
diff --git a/src/conn.h b/src/conn.h
index fb93c8f27..0ae2b1299 100644
--- a/src/conn.h
+++ b/src/conn.h
@@ -5,8 +5,6 @@
 #ifndef DQLITE_CONN_H_
 #define DQLITE_CONN_H_
 
-#include <raft.h>
-
 #include "lib/buffer.h"
 #include "lib/queue.h"
 #include "lib/transport.h"
@@ -14,6 +12,7 @@
 #include "gateway.h"
 #include "id.h"
 #include "message.h"
+#include "raft.h"
 
 /**
  * Callbacks.
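The include swap above repeats across the dqlite sources that follow: every use of the external `<raft.h>` becomes an include of the bundled `src/raft.h`. Combined with the `-DRAFT_API=''` and `-fvisibility=hidden` flags that Makefile.am passes when building `libdqlite.la`, this keeps the vendored raft symbols out of libdqlite's public ABI. A minimal sketch of the mechanism (illustration only, not part of the patch; `raft_strerror()` is one of the declarations actually affected):

```c
/* src/raft.h marks every public raft declaration with RAFT_API and falls
 * back to default (exported) visibility when the macro is not predefined: */
#ifndef RAFT_API
#define RAFT_API __attribute__((visibility("default")))
#endif

RAFT_API const char *raft_strerror(int errnum);

/* Built as a standalone libraft, the declaration expands to:
 *
 *   __attribute__((visibility("default"))) const char *raft_strerror(int);
 *
 * Built into libdqlite with -DRAFT_API='' and -fvisibility=hidden, it
 * expands to a plain declaration, so the symbol stays internal to
 * libdqlite.so and cannot clash with a system-installed libraft. */
```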
diff --git a/src/fsm.c b/src/fsm.c
index 2af791254..4daa86790 100644
--- a/src/fsm.c
+++ b/src/fsm.c
@@ -1,10 +1,9 @@
-#include <raft.h>
-
 #include "lib/assert.h"
 #include "lib/serialize.h"
 
 #include "command.h"
 #include "fsm.h"
+#include "raft.h"
 #include "tracing.h"
 #include "vfs.h"
diff --git a/src/fsm.h b/src/fsm.h
index 5c849c186..dcb5c828d 100644
--- a/src/fsm.h
+++ b/src/fsm.h
@@ -5,9 +5,8 @@
 #ifndef DQLITE_FSM_H_
 #define DQLITE_FSM_H_
 
-#include <raft.h>
-
 #include "config.h"
+#include "raft.h"
 #include "registry.h"
 
 /**
diff --git a/src/gateway.h b/src/gateway.h
index dd07fdd59..208d42fc8 100644
--- a/src/gateway.h
+++ b/src/gateway.h
@@ -5,8 +5,6 @@
 #ifndef DQLITE_GATEWAY_H_
 #define DQLITE_GATEWAY_H_
 
-#include <raft.h>
-
 #include "../include/dqlite.h"
 
 #include "lib/buffer.h"
@@ -15,6 +13,7 @@
 #include "config.h"
 #include "id.h"
 #include "leader.h"
+#include "raft.h"
 #include "registry.h"
 #include "stmt.h"
diff --git a/src/leader.h b/src/leader.h
index 38f245967..30c541a77 100644
--- a/src/leader.h
+++ b/src/leader.h
@@ -5,12 +5,12 @@
 #ifndef LEADER_H_
 #define LEADER_H_
 
-#include <raft.h>
 #include <sqlite3.h>
 #include <stdbool.h>
 
 #include "./lib/queue.h"
 #include "db.h"
+#include "raft.h"
 
 #define SQLITE_IOERR_NOT_LEADER (SQLITE_IOERR | (40 << 8))
 #define SQLITE_IOERR_LEADERSHIP_LOST (SQLITE_IOERR | (41 << 8))
diff --git a/src/lib/transport.c b/src/lib/transport.c
index f833266cb..8ea086215 100644
--- a/src/lib/transport.c
+++ b/src/lib/transport.c
@@ -1,4 +1,4 @@
-#include <raft.h>
+#include "../raft.h"
 
 #include "../../include/dqlite.h"
diff --git a/src/logger.h b/src/logger.h
index f4b38db97..a5cf4813e 100644
--- a/src/logger.h
+++ b/src/logger.h
@@ -1,7 +1,7 @@
 #ifndef LOGGER_H_
 #define LOGGER_H_
 
-#include <raft.h>
+#include "raft.h"
 
 #include "../include/dqlite.h"
diff --git a/src/raft.h b/src/raft.h
new file mode 100644
index 000000000..7f8496c58
--- /dev/null
+++ b/src/raft.h
@@ -0,0 +1,1953 @@
+#ifndef RAFT_H
+#define RAFT_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <uv.h>
+
+#ifndef RAFT_API
+#define RAFT_API __attribute__((visibility("default")))
+#endif
+
+#ifndef DQLITE_VISIBLE_TO_TESTS
+#define DQLITE_VISIBLE_TO_TESTS __attribute__((visibility("default")))
+#endif
+
+/**
+ * Version.
+ */
+#define RAFT_VERSION_MAJOR 0
+#define RAFT_VERSION_MINOR 18
+#define RAFT_VERSION_RELEASE 0
+#define RAFT_VERSION_NUMBER \
+	(RAFT_VERSION_MAJOR * 100 * 100 + RAFT_VERSION_MINOR * 100 + \
+	 RAFT_VERSION_RELEASE)
+
+int raft_version_number(void);
+
+/**
+ * Error codes.
+ */
+enum {
+	RAFT_NOMEM = 1,        /* Out of memory */
+	RAFT_BADID,            /* Server ID is not valid */
+	RAFT_DUPLICATEID,      /* Server ID already in use */
+	RAFT_DUPLICATEADDRESS, /* Server address already in use */
+	RAFT_BADROLE,          /* Server role is not valid */
+	RAFT_MALFORMED,
+	RAFT_NOTLEADER,
+	RAFT_LEADERSHIPLOST,
+	RAFT_SHUTDOWN,
+	RAFT_CANTBOOTSTRAP,
+	RAFT_CANTCHANGE,
+	RAFT_CORRUPT,
+	RAFT_CANCELED,
+	RAFT_NAMETOOLONG,
+	RAFT_TOOBIG,
+	RAFT_NOCONNECTION,
+	RAFT_BUSY,
+	RAFT_IOERR,        /* File system or storage error */
+	RAFT_NOTFOUND,     /* Resource not found */
+	RAFT_INVALID,      /* Invalid parameter */
+	RAFT_UNAUTHORIZED, /* No access to a resource */
+	RAFT_NOSPACE,      /* Not enough space on disk */
+	RAFT_TOOMANY       /* Some system or raft limit was hit */
+};
+
+/**
+ * Size of human-readable error message buffers.
+ */
+#define RAFT_ERRMSG_BUF_SIZE 256
+
+/**
+ * Return the error message describing the given error code.
+ */
+RAFT_API const char *raft_strerror(int errnum);
+
+typedef unsigned long long raft_id;
+
+/**
+ * Hold the value of a raft term. Guaranteed to be at least 64-bit long.
+ */ +typedef unsigned long long raft_term; + +/** + * Hold the value of a raft entry index. Guaranteed to be at least 64-bit long. + */ +typedef unsigned long long raft_index; + +/** + * Hold a time value expressed in milliseconds since the epoch. + */ +typedef unsigned long long raft_time; + +/** + * Hold the features a raft node is capable of. + */ +typedef uint64_t raft_flags; + +/** + * A data buffer. + */ +struct raft_buffer +{ + void *base; /* Pointer to the buffer data. */ + size_t len; /* Length of the buffer. */ +}; + +/** + * Server role codes. + */ +enum { + RAFT_STANDBY, /* Replicate log, does not participate in quorum. */ + RAFT_VOTER, /* Replicate log, does participate in quorum. */ + RAFT_SPARE /* Does not replicate log, or participate in quorum. */ +}; + +/** + * Hold information about a single server in the cluster configuration. + * WARNING: This struct is encoded/decoded, be careful when adapting it. + */ +struct raft_server +{ + raft_id id; /* Server ID, must be greater than zero. */ + char *address; /* Server address. User defined. */ + int role; /* Server role. */ +}; + +/** + * Hold information about all servers currently part of the cluster. + * WARNING: This struct is encoded/decoded, be careful when adapting it. + */ +struct raft_configuration +{ + struct raft_server + *servers; /* Array of servers member of the cluster. */ + unsigned n; /* Number of servers in the array. */ +}; + +/** + * Initialize an empty raft configuration. + */ +RAFT_API void raft_configuration_init(struct raft_configuration *c); + +/** + * Release all memory used by the given configuration object. + */ +RAFT_API void raft_configuration_close(struct raft_configuration *c); + +/** + * Add a server to a raft configuration. + * + * The @id must be greater than zero and @address point to a valid string. + * + * The @role must be either #RAFT_VOTER, #RAFT_STANDBY, #RAFT_SPARE. + * + * If @id or @address are already in use by another server in the configuration, + * an error is returned. + * + * The @address string will be copied and can be released after this function + * returns. + */ +RAFT_API int raft_configuration_add(struct raft_configuration *c, + raft_id id, + const char *address, + int role); + +/** + * Encode the given configuration object. + * + * The memory of the returned buffer is allocated using raft_malloc(), and + * client code is responsible for releasing it when no longer needed. + */ +RAFT_API int raft_configuration_encode(const struct raft_configuration *c, + struct raft_buffer *buf); + +/** + * Hash function which outputs a 64-bit value based on a text and a number. + * + * This can be used to generate a unique ID for a new server being added, for + * example based on its address and on the current time in milliseconds since + * the Epoch. + * + * It's internally implemented as a SHA1 where only the last 8 bytes of the hash + * value are kept. + */ +RAFT_API unsigned long long raft_digest(const char *text, unsigned long long n); + +/** + * Log entry types. + */ +enum { + RAFT_COMMAND = 1, /* Command for the application FSM. */ + RAFT_BARRIER, /* Wait for all previous commands to be applied. */ + RAFT_CHANGE /* Raft configuration change. */ +}; + +/** + * A single entry in the raft log. 
+ * + * An entry that originated from this raft instance while it was the leader + * (typically via client calls to raft_apply()) should normally have a @buf + * attribute referencing directly the memory that was originally allocated by + * the client itself to contain the entry data, and the @batch attribute set to + * #NULL. + * + * An entry that was received from the network as part of an AppendEntries RPC + * or that was loaded from disk at startup should normally have a @batch + * attribute that points to a contiguous chunk of memory that contains the data + * of the entry itself plus possibly the data for other entries that were + * received or loaded with it at the same time. In this case the @buf pointer + * will be equal to the @batch pointer plus an offset, that locates the position + * of the entry's data within the batch. + * + * When the @batch attribute is not #NULL the raft library will take care of + * releasing that memory only once there are no more references to the + * associated entries. + * + * This arrangement makes it possible to minimize the amount of memory-copying + * when performing I/O. + */ +struct raft_entry +{ + raft_term term; /* Term in which the entry was created. */ + unsigned short type; /* Type (FSM command, barrier, config change). */ + struct raft_buffer buf; /* Entry data. */ + void *batch; /* Batch that buf's memory points to, if any. */ +}; + +/** + * Hold the arguments of a RequestVote RPC. + * + * The RequestVote RPC is invoked by candidates to gather votes. + */ +struct raft_request_vote +{ + int version; + raft_term term; /* Candidate's term. */ + raft_id candidate_id; /* ID of the server requesting the vote. */ + raft_index last_log_index; /* Index of candidate's last log entry. */ + raft_index last_log_term; /* Term of log entry at last_log_index. */ + bool disrupt_leader; /* True if current leader should be discarded. */ + bool pre_vote; /* True if this is a pre-vote request. */ +}; +#define RAFT_REQUEST_VOTE_VERSION 2 + +/** + * Hold the result of a RequestVote RPC. + */ +struct raft_request_vote_result +{ + int version; + raft_term + term; /* Receiver's current term (candidate updates itself). */ + bool vote_granted; /* True means candidate received vote. */ + bool pre_vote; /* The response to a pre-vote RequestVote or not. */ +}; +#define RAFT_REQUEST_VOTE_RESULT_VERSION 2 + +/** + * Hold the arguments of an AppendEntries RPC. + * + * The AppendEntries RPC is invoked by the leader to replicate log entries. It's + * also used as heartbeat (figure 3.1). + */ +struct raft_append_entries +{ + int version; + raft_term term; /* Leader's term. */ + raft_index prev_log_index; /* Index of log entry preceeding new ones. */ + raft_term prev_log_term; /* Term of entry at prev_log_index. */ + raft_index leader_commit; /* Leader's commit index. */ + struct raft_entry *entries; /* Log entries to append. */ + unsigned n_entries; /* Size of the log entries array. */ +}; +#define RAFT_APPEND_ENTRIES_VERSION 0 + +/** + * Hold the result of an AppendEntries RPC (figure 3.1). + */ +struct raft_append_entries_result +{ + int version; + raft_term term; /* Receiver's current_term. */ + raft_index rejected; /* If non-zero, the index that was rejected. */ + raft_index + last_log_index; /* Receiver's last log entry index, as hint. */ + raft_flags features; /* Feature flags. */ +}; +#define RAFT_APPEND_ENTRIES_RESULT_VERSION 1 + +/** + * Hold the arguments of an InstallSnapshot RPC (figure 5.3). 
+ */ +struct raft_install_snapshot +{ + int version; + raft_term term; /* Leader's term. */ + raft_index last_index; /* Index of last entry in the snapshot. */ + raft_term last_term; /* Term of last_index. */ + struct raft_configuration conf; /* Config as of last_index. */ + raft_index conf_index; /* Commit index of conf. */ + struct raft_buffer data; /* Raw snapshot data. */ +}; +#define RAFT_INSTALL_SNAPSHOT_VERSION 0 + +/** + * Hold the arguments of a TimeoutNow RPC. + * + * The TimeoutNow RPC is invoked by leaders to transfer leadership to a + * follower. + */ +struct raft_timeout_now +{ + int version; + raft_term term; /* Leader's term. */ + raft_index last_log_index; /* Index of leader's last log entry. */ + raft_index last_log_term; /* Term of log entry at last_log_index. */ +}; +#define RAFT_TIMEOUT_NOW_VERSION 0 + +/** + * Type codes for RPC messages. + */ +enum { + RAFT_IO_APPEND_ENTRIES = 1, + RAFT_IO_APPEND_ENTRIES_RESULT, + RAFT_IO_REQUEST_VOTE, + RAFT_IO_REQUEST_VOTE_RESULT, + RAFT_IO_INSTALL_SNAPSHOT, + RAFT_IO_TIMEOUT_NOW +}; + +/** + * A single RPC message that can be sent or received over the network. + * + * The RPC message types all have a `version` field. + * In the libuv io implementation, `version` is filled out during decoding + * and is based on the size of the message on the wire, see e.g. + * `sizeofRequestVoteV1`. The version number in the RAFT_MESSAGE_XXX_VERSION + * macro needs to be bumped every time the message is updated. + * + * Notes when adding a new message type to raft: + * raft_io implementations compiled against old versions of raft don't know the + * new message type and possibly have not allocated enough space for it. When + * such an application receives a new message over the wire, the raft_io + * implementation will err out or drop the message, because it doesn't know how + * to decode it based on its type. + * raft_io implementations compiled against versions of raft that know the new + * message type but at runtime are linked against an older raft lib, will pass + * the message to raft, where raft will drop it. + * When raft receives a message and accesses a field of a new message type, + * the raft_io implementation must have known about the new message type, + * so it was compiled against a modern enough version of raft, and memory + * accesses should be safe. + * + * Sending a new message type with a raft_io implementation that doesn't know + * the type is safe, the implementation should drop the message based on its + * type and will not try to access fields it doesn't know the existence of. + */ +struct raft_message +{ + unsigned short type; /* RPC type code. */ + raft_id server_id; /* ID of sending or destination server. */ + const char + *server_address; /* Address of sending or destination server. */ + union { /* Type-specific data */ + struct raft_request_vote request_vote; + struct raft_request_vote_result request_vote_result; + struct raft_append_entries append_entries; + struct raft_append_entries_result append_entries_result; + struct raft_install_snapshot install_snapshot; + struct raft_timeout_now timeout_now; + }; +}; + +/** + * Hold the details of a snapshot. + * The user-provided raft_buffer structs should provide the user with enough + * flexibility to adapt/evolve snapshot formats. + * If this struct would NEED to be adapted in the future, raft can always move + * to a new struct with a new name and a new raft_io version. + */ +struct raft_snapshot +{ + /* Index and term of last entry included in the snapshot. 
*/ + raft_index index; + raft_term term; + + /* Last committed configuration included in the snapshot, along with the + * index it was committed at. */ + struct raft_configuration configuration; + raft_index configuration_index; + + /* Content of the snapshot. When a snapshot is taken, the user FSM can + * fill the bufs array with more than one buffer. When a snapshot is + * restored, there will always be a single buffer. */ + struct raft_buffer *bufs; + unsigned n_bufs; +}; + +/** + * Asynchronous request to send an RPC message. + */ +struct raft_io_send; +typedef void (*raft_io_send_cb)(struct raft_io_send *req, int status); +struct raft_io_send +{ + void *data; /* User data */ + raft_io_send_cb cb; /* Request callback */ +}; + +/** + * Asynchronous request to store new log entries. + */ +struct raft_io_append; +typedef void (*raft_io_append_cb)(struct raft_io_append *req, int status); +struct raft_io_append +{ + void *data; /* User data */ + raft_io_append_cb cb; /* Request callback */ +}; + +/** + * Asynchronous request to store a new snapshot. + */ +struct raft_io_snapshot_put; +typedef void (*raft_io_snapshot_put_cb)(struct raft_io_snapshot_put *req, + int status); +struct raft_io_snapshot_put +{ + void *data; /* User data */ + raft_io_snapshot_put_cb cb; /* Request callback */ +}; + +/** + * Asynchronous request to load the most recent snapshot available. + */ +struct raft_io_snapshot_get; +typedef void (*raft_io_snapshot_get_cb)(struct raft_io_snapshot_get *req, + struct raft_snapshot *snapshot, + int status); +struct raft_io_snapshot_get +{ + void *data; /* User data */ + raft_io_snapshot_get_cb cb; /* Request callback */ +}; + +/** + * Asynchronous work request. + */ +struct raft_io_async_work; +typedef int (*raft_io_async_work_fn)(struct raft_io_async_work *req); +typedef void (*raft_io_async_work_cb)(struct raft_io_async_work *req, + int status); +struct raft_io_async_work +{ + void *data; /* User data */ + raft_io_async_work_fn + work; /* Function to run async from the main loop */ + raft_io_async_work_cb cb; /* Request callback */ +}; + +/** + * Customizable tracer, for debugging purposes. + */ +struct raft_tracer +{ + /** + * Implementation-defined state object. + */ + void *impl; + + /** + * Whether this tracer should emit messages. + */ + bool enabled; + + /** + * Trace level. + */ + unsigned level; + + /** + * Emit the given trace message, possibly decorating it with the + * provided metadata. + */ + void (*emit)(struct raft_tracer *t, + const char *file, + unsigned int line, + const char *func, + unsigned int level, + const char *message); +}; + +struct raft_io; /* Forward declaration. */ + +/** + * Callback invoked by the I/O implementation at regular intervals. + */ +typedef void (*raft_io_tick_cb)(struct raft_io *io); + +/** + * Callback invoked by the I/O implementation when an RPC message is received. + */ +typedef void (*raft_io_recv_cb)(struct raft_io *io, struct raft_message *msg); + +typedef void (*raft_io_close_cb)(struct raft_io *io); + +/** + * version field MUST be filled out by user. + * When moving to a new version, the user MUST implement the newly added + * methods. 
+ */ +struct raft_io +{ + int version; /* 1 or 2 */ + void *data; + void *impl; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int (*init)(struct raft_io *io, raft_id id, const char *address); + void (*close)(struct raft_io *io, raft_io_close_cb cb); + int (*load)(struct raft_io *io, + raft_term *term, + raft_id *voted_for, + struct raft_snapshot **snapshot, + raft_index *start_index, + struct raft_entry *entries[], + size_t *n_entries); + int (*start)(struct raft_io *io, + unsigned msecs, + raft_io_tick_cb tick, + raft_io_recv_cb recv); + int (*bootstrap)(struct raft_io *io, + const struct raft_configuration *conf); + int (*recover)(struct raft_io *io, + const struct raft_configuration *conf); + int (*set_term)(struct raft_io *io, raft_term term); + int (*set_vote)(struct raft_io *io, raft_id server_id); + int (*send)(struct raft_io *io, + struct raft_io_send *req, + const struct raft_message *message, + raft_io_send_cb cb); + int (*append)(struct raft_io *io, + struct raft_io_append *req, + const struct raft_entry entries[], + unsigned n, + raft_io_append_cb cb); + int (*truncate)(struct raft_io *io, raft_index index); + int (*snapshot_put)(struct raft_io *io, + unsigned trailing, + struct raft_io_snapshot_put *req, + const struct raft_snapshot *snapshot, + raft_io_snapshot_put_cb cb); + int (*snapshot_get)(struct raft_io *io, + struct raft_io_snapshot_get *req, + raft_io_snapshot_get_cb cb); + raft_time (*time)(struct raft_io *io); + int (*random)(struct raft_io *io, int min, int max); + /* Field(s) below added since version 2. */ + int (*async_work)(struct raft_io *io, + struct raft_io_async_work *req, + raft_io_async_work_cb cb); +}; + +/** + * version field MUST be filled out by user. + * When moving to a new version, the user MUST initialize the new methods, + * either with an implementation or with NULL. + * + * version 2: + * introduces `snapshot_finalize`, when this method is not NULL, it will + * always run after a successful call to `snapshot`, whether the snapshot has + * been successfully written to disk or not. If it is set, raft will + * assume no ownership of any of the `raft_buffer`s and the responsibility to + * clean up lies with the user of raft. + * `snapshot_finalize` can be used to e.g. release a lock that was taken during + * a call to `snapshot`. Until `snapshot_finalize` is called, raft can access + * the data contained in the `raft_buffer`s. + * + * version 3: + * Adds support for async snapshots through the `snapshot_async` function. + * When this method is provided, raft will call `snapshot` in the main loop, + * and when successful, will call `snapshot_async` using the `io->async_work` + * method, so blocking I/O calls are allowed in the implementation. After the + * `snapshot_async` completes, `snapshot_finalize` will be called in the main + * loop, independent of the return value of `snapshot_async`. + * An implementation that does not use asynchronous snapshots MUST set + * `snapshot_async` to NULL. + * All memory allocated by the snapshot routines MUST be freed by the snapshot + * routines themselves. + */ + +struct raft_fsm +{ + int version; /* 1, 2 or 3 */ + void *data; + int (*apply)(struct raft_fsm *fsm, + const struct raft_buffer *buf, + void **result); + int (*snapshot)(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs); + int (*restore)(struct raft_fsm *fsm, struct raft_buffer *buf); + /* Fields below added since version 2. 
+         */
+        int (*snapshot_finalize)(struct raft_fsm *fsm,
+                                 struct raft_buffer *bufs[],
+                                 unsigned *n_bufs);
+        /* Fields below added since version 3. */
+        int (*snapshot_async)(struct raft_fsm *fsm,
+                              struct raft_buffer *bufs[],
+                              unsigned *n_bufs);
+};
+
+struct raft; /* Forward declaration. */
+
+/**
+ * State codes.
+ */
+enum { RAFT_UNAVAILABLE, RAFT_FOLLOWER, RAFT_CANDIDATE, RAFT_LEADER };
+
+/**
+ * State callback to invoke if raft's state changes.
+ */
+typedef void (*raft_state_cb)(struct raft *raft,
+                              unsigned short old_state,
+                              unsigned short new_state);
+
+struct raft_progress;
+
+/**
+ * Close callback.
+ *
+ * It's safe to release the memory of a raft instance only after this callback
+ * has fired.
+ */
+typedef void (*raft_close_cb)(struct raft *raft);
+
+struct raft_change;   /* Forward declaration */
+struct raft_transfer; /* Forward declaration */
+
+struct raft_log;
+
+/**
+ * Hold and drive the state of a single raft server in a cluster.
+ * When replacing reserved fields in the middle of this struct, you MUST use a
+ * type with the same size and alignment requirements as the original type.
+ */
+struct raft
+{
+        void *data;                 /* Custom user data. */
+        struct raft_tracer *tracer; /* Tracer implementation. */
+        struct raft_io *io;         /* Disk and network I/O implementation. */
+        struct raft_fsm *fsm;       /* User-defined FSM to apply commands to. */
+        raft_id id;                 /* Server ID of this raft instance. */
+        char *address;              /* Server address of this raft instance. */
+
+        /*
+         * Cache of the server's persistent state, updated on stable storage
+         * before responding to RPCs (Figure 3.1).
+         */
+        raft_term current_term; /* Latest term server has seen. */
+        raft_id voted_for; /* Candidate that received vote in current term. */
+        struct raft_log *log; /* Log entries. */
+
+        /*
+         * Current membership configuration (Chapter 4).
+         *
+         * At any given moment the current configuration can be committed or
+         * uncommitted.
+         *
+         * If a server is voting, the log entry with index 1 must always contain
+         * the first committed configuration.
+         *
+         * At all times #configuration_committed_index is either zero or is the
+         * index of the most recent log entry of type #RAFT_CHANGE that we know
+         * to be committed. That means #configuration_committed_index is always
+         * equal to or lower than #commit_index.
+         *
+         * At all times #configuration_uncommitted_index is either zero or is
+         * the index of an uncommitted log entry of type #RAFT_CHANGE. There can
+         * be at most one uncommitted entry of type #RAFT_CHANGE because we
+         * allow only one configuration change at a time.
+         *
+         * At all times #configuration_last_snapshot is a copy of the
+         * configuration contained in the most recent snapshot, if any.
+         *
+         * The possible scenarios are:
+         *
+         * 1. #configuration_committed_index and
+         *    #configuration_uncommitted_index are both zero. This should only
+         *    happen when a brand new server starts joining a cluster and is
+         *    waiting to receive log entries from the current leader. In this case
+         *    #configuration and #configuration_last_snapshot must be empty and
+         *    have no servers.
+         *
+         * 2. #configuration_committed_index is non-zero and
+         *    #configuration_uncommitted_index is zero. This means that
+         *    #configuration is committed and there is no pending configuration
+         *    change. The content of #configuration must match the one of the
+         *    log entry at #configuration_committed_index.
+         *
+         * 3. #configuration_committed_index and
+         *    #configuration_uncommitted_index are both non-zero, with the latter
+         *    being greater than the former. This means that #configuration is
+         *    uncommitted and represents a pending configuration change. The
+         *    content of #configuration must match the one of the log entry at
+         *    #configuration_uncommitted_index.
+         *
+         * When a snapshot is taken, a copy of the most recent configuration
+         * known to be committed (i.e. the configuration contained in the log
+         * entry at #configuration_committed_index) is saved in
+         * #configuration_last_snapshot, so it can be easily retrieved in case
+         * the log gets truncated because of compaction and does not contain the
+         * entry at #configuration_committed_index anymore. Likewise, if a
+         * snapshot is restored, its associated configuration is saved in
+         * #configuration_last_snapshot.
+         */
+        struct raft_configuration configuration;
+        struct raft_configuration configuration_last_snapshot;
+        raft_index configuration_committed_index;
+        raft_index configuration_uncommitted_index;
+
+        /*
+         * Election timeout in milliseconds (default 1000).
+         *
+         * From 3.4:
+         *
+         *   Raft uses a heartbeat mechanism to trigger leader election. When
+         *   servers start up, they begin as followers. A server remains in
+         *   follower state as long as it receives valid RPCs from a leader or
+         *   candidate. Leaders send periodic heartbeats (AppendEntries RPCs
+         *   that carry no log entries) to all followers in order to maintain
+         *   their authority. If a follower receives no communication over a
+         *   period of time called the election timeout, then it assumes there is
+         *   no viable leader and begins an election to choose a new leader.
+         *
+         * This is the baseline value and will be randomized between 1x and 2x.
+         *
+         * See raft_set_election_timeout() to customize the value of this
+         * attribute.
+         */
+        unsigned election_timeout;
+
+        /*
+         * Heartbeat timeout in milliseconds (default 100). This is relevant
+         * only when the raft instance is in leader state: empty
+         * AppendEntries RPCs will be sent if this number of milliseconds
+         * elapses without any user-triggered AppendEntries RPCs being sent.
+         *
+         * From Figure 3.1:
+         *
+         *   [Leaders] Send empty AppendEntries RPC during idle periods to
+         *   prevent election timeouts.
+         */
+        unsigned heartbeat_timeout;
+
+        /*
+         * When the leader sends an InstallSnapshot RPC to a follower it will
+         * consider the RPC as failed after this timeout and retry.
+         */
+        unsigned install_snapshot_timeout;
+
+        /*
+         * The fields below hold the part of the server's volatile state which
+         * is always applicable regardless of whether the server is
+         * follower, candidate or leader (Figure 3.1). This state is rebuilt
+         * automatically after a server restart.
+         */
+        raft_index commit_index; /* Highest log entry known to be committed */
+        raft_index last_applied; /* Highest log entry applied to the FSM */
+        raft_index last_stored;  /* Highest log entry persisted on disk */
+
+        /*
+         * Current server state of this raft instance, along with a union
+         * defining state-specific values.
+         */
+        unsigned short state;
+        union {
+                struct /* Follower */
+                {
+                        unsigned
+                            randomized_election_timeout; /* Timer expiration. */
+                        struct /* Current leader info. */
+                        {
+                                raft_id id;
+                                char *address;
+                        } current_leader;
+                        uint64_t append_in_flight_count;
+                        uint64_t reserved[7]; /* Future use */
+                } follower_state;
+                struct
+                {
+                        unsigned
+                            randomized_election_timeout; /* Timer expiration. */
+                        bool *votes;                     /* Vote results. */
+                        bool disrupt_leader;  /* For leadership transfer */
+                        bool in_pre_vote;     /* True in pre-vote phase. */
+                        uint64_t reserved[8]; /* Future use */
+                } candidate_state;
+                struct
+                {
+                        struct raft_progress
+                            *progress; /* Per-server replication state. */
+                        struct raft_change
+                            *change;         /* Pending membership change. */
+                        raft_id promotee_id; /* ID of server being promoted. */
+                        unsigned short round_number; /* Current sync round. */
+                        raft_index
+                            round_index;       /* Target of the current round. */
+                        raft_time round_start; /* Start of current round. */
+                        void *requests[2];     /* Outstanding client requests. */
+                        uint32_t
+                            voter_contacts; /* Current number of voting nodes we
+                                               are in contact with */
+                        uint32_t reserved2;   /* Future use */
+                        uint64_t reserved[7]; /* Future use */
+                } leader_state;
+        };
+
+        /* Election timer start.
+         *
+         * This timer has different purposes depending on the state. Followers
+         * convert to candidate after the randomized election timeout has
+         * elapsed without leader contact. Candidates start a new election after
+         * the randomized election timeout has elapsed without a winner. Leaders
+         * step down after the election timeout has elapsed without contacting a
+         * majority of voting servers. */
+        raft_time election_timer_start;
+
+        /* In-progress leadership transfer request, if any. */
+        struct raft_transfer *transfer;
+
+        /*
+         * Information about the last snapshot that was taken (if any).
+         */
+        struct
+        {
+                unsigned threshold;              /* N. of entries before snapshot */
+                unsigned trailing;               /* N. of trailing entries to retain */
+                struct raft_snapshot pending;    /* In progress snapshot */
+                struct raft_io_snapshot_put put; /* Store snapshot request */
+                uint64_t reserved[8];            /* Future use */
+        } snapshot;
+
+        /*
+         * Callback to invoke once a close request has completed.
+         */
+        raft_close_cb close_cb;
+
+        /*
+         * Human-readable message providing diagnostic information about the
+         * last error that occurred.
+         */
+        char errmsg[RAFT_ERRMSG_BUF_SIZE];
+
+        /* Whether to use pre-vote to avoid disconnected servers disrupting the
+         * current leader, as described in 4.2.3 and 9.6. */
+        bool pre_vote;
+
+        /* Limit how long to wait for a stand-by to catch up with the log when
+         * it's being promoted to voter. */
+        unsigned max_catch_up_rounds;
+        unsigned max_catch_up_round_duration;
+
+        /* uint64_t because we used a reserved field. In reality this is a
+         * pointer to a `struct raft_callbacks` that can be used to store e.g.
+         * various user-supplied callbacks. */
+        uint64_t callbacks;
+
+        /* Future extensions */
+        uint64_t reserved[31];
+};
+
+RAFT_API int raft_init(struct raft *r,
+                       struct raft_io *io,
+                       struct raft_fsm *fsm,
+                       raft_id id,
+                       const char *address);
+
+RAFT_API void raft_close(struct raft *r, raft_close_cb cb);
+
+/**
+ * This function MUST be called after raft_init and before raft_start.
+ * @cb will be called every time the raft state changes.
+ */
+RAFT_API void raft_register_state_cb(struct raft *r, raft_state_cb cb);
+
+/**
+ * Bootstrap this raft instance using the given configuration. The instance must
+ * not have been started yet and must be completely pristine, otherwise
+ * #RAFT_CANTBOOTSTRAP will be returned.
+ */
+RAFT_API int raft_bootstrap(struct raft *r,
+                            const struct raft_configuration *conf);
+
+/**
+ * Force a new configuration in order to recover from a loss of quorum where the
+ * current configuration cannot be restored, such as when a majority of servers
+ * die at the same time.
+ *
+ * This works by appending the new configuration directly to the log stored on
+ * disk.
+ *
+ * In order for this operation to be safe, you must follow these steps:
+ *
+ * 1. Make sure that no servers in the cluster are running, either because they
+ *    died or because you manually stopped them.
+ *
+ * 2. Run @raft_recover exactly one time, on the non-dead server which has
+ *    the highest term and the longest log.
+ *
+ * 3. Copy the data directory of the server you ran @raft_recover on to all
+ *    other non-dead servers in the cluster, replacing their current data
+ *    directory.
+ *
+ * 4. Restart all servers.
+ */
+RAFT_API int raft_recover(struct raft *r,
+                          const struct raft_configuration *conf);
+
+RAFT_API int raft_start(struct raft *r);
+
+/**
+ * Set the election timeout.
+ *
+ * Every raft instance is initialized with a default election timeout of 1000
+ * milliseconds. If you wish to tweak it, call this function before starting
+ * your event loop.
+ *
+ * From Chapter 9:
+ *
+ *   We recommend a range that is 10-20 times the one-way network latency, which
+ *   keeps split vote rates under 40% in all cases for reasonably sized
+ *   clusters, and typically results in much lower rates.
+ *
+ * Note that the current random election timer will be reset and a new one
+ * will be generated.
+ */
+RAFT_API void raft_set_election_timeout(struct raft *r, unsigned msecs);
+
+/**
+ * Set the heartbeat timeout.
+ */
+RAFT_API void raft_set_heartbeat_timeout(struct raft *r, unsigned msecs);
+
+/**
+ * Set the snapshot install timeout.
+ */
+RAFT_API void raft_set_install_snapshot_timeout(struct raft *r, unsigned msecs);
+
+/**
+ * Number of outstanding log entries before starting a new snapshot. The default
+ * is 1024.
+ */
+RAFT_API void raft_set_snapshot_threshold(struct raft *r, unsigned n);
+
+/**
+ * Enable or disable pre-vote support. Pre-vote is turned off by default.
+ */
+RAFT_API void raft_set_pre_vote(struct raft *r, bool enabled);
+
+/**
+ * Number of outstanding log entries to keep in the log after a snapshot has
+ * been taken. This avoids sending snapshots when a follower is behind by just a
+ * few entries. The default is 128.
+ */
+RAFT_API void raft_set_snapshot_trailing(struct raft *r, unsigned n);
+
+/**
+ * Set the maximum number of catch-up rounds to try when replicating entries
+ * to a stand-by server that is being promoted to voter, before giving up and
+ * failing the configuration change. The default is 10.
+ */
+RAFT_API void raft_set_max_catch_up_rounds(struct raft *r, unsigned n);
+
+/**
+ * Set the maximum duration of a catch-up round when replicating entries to a
+ * stand-by server that is being promoted to voter. The default is 5 seconds.
+ */
+RAFT_API void raft_set_max_catch_up_round_duration(struct raft *r,
+                                                   unsigned msecs);
+
+/**
+ * Return a human-readable description of the last error that occurred.
+ */
+RAFT_API const char *raft_errmsg(struct raft *r);
+
+/**
+ * Return the code of the current raft state (follower/candidate/leader).
+ */
+RAFT_API int raft_state(struct raft *r);
+
+/**
+ * Return the code of the current raft role (spare/standby/voter),
+ * or -1 if this server is not in the current configuration.
+ */
+RAFT_API int raft_role(struct raft *r);
+
+/**
+ * Return the ID and address of the current known leader, if any.
+ */
+RAFT_API void raft_leader(struct raft *r, raft_id *id, const char **address);
+
+/**
+ * Return the index of the last entry that was appended to the local log.
+ */ +RAFT_API raft_index raft_last_index(struct raft *r); + +/** + * Return the index of the last entry that was applied to the local FSM. + */ +RAFT_API raft_index raft_last_applied(struct raft *r); + +/** + * Return the number of voting servers that the leader has recently been in + * contact with. This can be used to help determine whether the cluster may be + * in a degraded/at risk state. + * + * Returns valid values >= 1, because a leader is always in contact with + * itself. + * Returns -1 if called on a follower. + * + * Note that the value returned may be out of date, and so should not be relied + * upon for absolute correctness. + */ +RAFT_API int raft_voter_contacts(struct raft *r); + +/** + * Common fields across client request types. + * `req_id`, `client_id` and `unique_id` are currently unused. + * `reserved` fields should be replaced by new members with the same size + * and alignment requirements as `uint64_t`. + */ +#define RAFT__REQUEST \ + void *data; \ + int type; \ + raft_index index; \ + void *queue[2]; \ + uint8_t req_id[16]; \ + uint8_t client_id[16]; \ + uint8_t unique_id[16]; \ + uint64_t reserved[4] + +/** + * Asynchronous request to append a new command entry to the log and apply it to + * the FSM when a quorum is reached. + */ +struct raft_apply; +typedef void (*raft_apply_cb)(struct raft_apply *req, int status, void *result); +struct raft_apply +{ + RAFT__REQUEST; + raft_apply_cb cb; +}; + +/** + * Propose to append commands to the log and apply them to the FSM once + * committed. + * + * If this server is the leader, it will create @n new log entries of type + * #RAFT_COMMAND using the given buffers as their payloads, append them to its + * own log and attempt to replicate them on other servers by sending + * AppendEntries RPCs. + * + * The memory pointed at by the @base attribute of each #raft_buffer in the + * given array must have been allocated with raft_malloc() or a compatible + * allocator. If this function returns 0, the ownership of this memory is + * implicitly transferred to the raft library, which will take care of releasing + * it when appropriate. Any further client access to such memory leads to + * undefined behavior. + * + * The ownership of the memory of the @bufs array itself is not transferred to + * the raft library, and, if allocated dynamically, must be deallocated by the + * caller. + * + * If the command was successfully applied, r->last_applied will be equal to + * the log entry index of the applied command when the cb is invoked. + */ +RAFT_API int raft_apply(struct raft *r, + struct raft_apply *req, + const struct raft_buffer bufs[], + const unsigned n, + raft_apply_cb cb); + +/** + * Asynchronous request to append a barrier entry. + */ +struct raft_barrier; +typedef void (*raft_barrier_cb)(struct raft_barrier *req, int status); +struct raft_barrier +{ + RAFT__REQUEST; + raft_barrier_cb cb; +}; + +/** + * Propose to append a log entry of type #RAFT_BARRIER. + * + * This can be used to ensure that there are no unapplied commands. + */ +RAFT_API int raft_barrier(struct raft *r, + struct raft_barrier *req, + raft_barrier_cb cb); + +/** + * Asynchronous request to change the raft configuration. + */ +typedef void (*raft_change_cb)(struct raft_change *req, int status); +struct raft_change +{ + RAFT__REQUEST; + raft_change_cb cb; +}; + +/** + * Add a new server to the cluster configuration. Its initial role will be + * #RAFT_SPARE. 
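+ *
+ * Illustrative usage (editor's sketch; the callback, server ID and address
+ * are hypothetical, r is the local struct raft instance, and error handling
+ * is omitted):
+ *
+ *     static void add_cb(struct raft_change *req, int status)
+ *     {
+ *         if (status != 0) {
+ *             ... the new server could not be added ...
+ *         }
+ *     }
+ *
+ *     struct raft_change req;
+ *     raft_add(&r, &req, 4, "127.0.0.1:9004", add_cb);
+ *
+ * Once added, the server can later be given a different role with
+ * raft_assign().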
+ */
+RAFT_API int raft_add(struct raft *r,
+                      struct raft_change *req,
+                      raft_id id,
+                      const char *address,
+                      raft_change_cb cb);
+
+/**
+ * Assign a new role to the given server.
+ *
+ * If the server already has the given role, or if the given role is unknown,
+ * #RAFT_BADROLE is returned.
+ */
+RAFT_API int raft_assign(struct raft *r,
+                         struct raft_change *req,
+                         raft_id id,
+                         int role,
+                         raft_change_cb cb);
+
+/**
+ * Remove the given server from the cluster configuration.
+ */
+RAFT_API int raft_remove(struct raft *r,
+                         struct raft_change *req,
+                         raft_id id,
+                         raft_change_cb cb);
+
+/**
+ * Asynchronous request to transfer leadership.
+ */
+typedef void (*raft_transfer_cb)(struct raft_transfer *req);
+struct raft_transfer
+{
+        RAFT__REQUEST;
+        raft_id id;               /* ID of target server. */
+        raft_time start;          /* Start of leadership transfer. */
+        struct raft_io_send send; /* For sending TimeoutNow */
+        raft_transfer_cb cb;      /* User callback */
+};
+
+/**
+ * Transfer leadership to the server with the given ID.
+ *
+ * If the target server is not part of the configuration, or it's the leader
+ * itself, or it's not a #RAFT_VOTER, then #RAFT_BADID is returned.
+ *
+ * The special value #0 means to automatically select a voting follower to
+ * transfer leadership to. If there are no voting followers, return
+ * #RAFT_NOTFOUND.
+ *
+ * When this server detects that the target server has become the leader, or
+ * when @election_timeout milliseconds have elapsed, the given callback will be
+ * invoked.
+ *
+ * After the callback fires, clients can check whether the operation was
+ * successful or not by calling @raft_leader() and checking if it returns the
+ * target server.
+ */
+RAFT_API int raft_transfer(struct raft *r,
+                           struct raft_transfer *req,
+                           raft_id id,
+                           raft_transfer_cb cb);
+
+/**
+ * User-definable dynamic memory allocation functions.
+ *
+ * The @data field will be passed as first argument to all functions.
+ */
+struct raft_heap
+{
+        void *data; /* User data */
+        void *(*malloc)(void *data, size_t size);
+        void (*free)(void *data, void *ptr);
+        void *(*calloc)(void *data, size_t nmemb, size_t size);
+        void *(*realloc)(void *data, void *ptr, size_t size);
+        void *(*aligned_alloc)(void *data, size_t alignment, size_t size);
+        void (*aligned_free)(void *data, size_t alignment, void *ptr);
+};
+
+DQLITE_VISIBLE_TO_TESTS void *raft_malloc(size_t size);
+DQLITE_VISIBLE_TO_TESTS void raft_free(void *ptr);
+DQLITE_VISIBLE_TO_TESTS void *raft_calloc(size_t nmemb, size_t size);
+DQLITE_VISIBLE_TO_TESTS void *raft_realloc(void *ptr, size_t size);
+DQLITE_VISIBLE_TO_TESTS void *raft_aligned_alloc(size_t alignment, size_t size);
+DQLITE_VISIBLE_TO_TESTS void raft_aligned_free(size_t alignment, void *ptr);
+
+/**
+ * Use a custom dynamic memory allocator.
+ */
+DQLITE_VISIBLE_TO_TESTS void raft_heap_set(struct raft_heap *heap);
+
+/**
+ * Use the default dynamic memory allocator (from the stdlib). This clears any
+ * custom allocator specified with @raft_heap_set.
+ */
+DQLITE_VISIBLE_TO_TESTS void raft_heap_set_default(void);
+
+/**
+ * Return a reference to the current dynamic memory allocator.
+ *
+ * This is intended for use by applications that want to temporarily replace
+ * and then restore the original allocator, or that want to defer to the
+ * original allocator in some circumstances.
+ *
+ * The behavior of attempting to mutate the default allocator through the
+ * pointer returned by this function, including attempting to deallocate
+ * the backing memory, is undefined.
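+ *
+ * For example (editor's sketch; counting_heap is a hypothetical
+ * user-defined allocator), a test can temporarily swap allocators and then
+ * restore the original one:
+ *
+ *     const struct raft_heap *orig = raft_heap_get();
+ *     raft_heap_set(&counting_heap);
+ *     ... exercise the code under test ...
+ *     raft_heap_set((struct raft_heap *)orig);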
+ */
+DQLITE_VISIBLE_TO_TESTS const struct raft_heap *raft_heap_get(void);
+
+#undef RAFT__REQUEST
+
+struct raft_uv_transport;
+
+/**
+ * Configure the given @raft_io instance to use a libuv-based I/O
+ * implementation.
+ *
+ * The @dir path will be copied, and its memory can possibly be released once
+ * this function returns.
+ *
+ * Return #RAFT_NAMETOOLONG if @dir exceeds the size of the internal buffer
+ * that should hold it.
+ *
+ * Return #RAFT_NOTFOUND if @dir does not exist.
+ *
+ * Return #RAFT_INVALID if @dir exists but it's not a directory.
+ *
+ * The implementation of metadata and log persistence is virtually the same as
+ * the one found in LogCabin [0].
+ *
+ * The disk files consist of metadata files, closed segments, and open
+ * segments. Metadata files are used to track Raft metadata, such as the
+ * server's current term, vote, and log's start index. Segments contain
+ * contiguous entries that are part of the log. Closed segments are never
+ * written to again (but may be renamed and truncated if a suffix of the log is
+ * truncated). Open segments are where newly appended entries go. Once an open
+ * segment reaches the maximum allowed size, it is closed and a new one is used.
+ *
+ * Metadata files are named "metadata1" and "metadata2". The code alternates
+ * between these so that there is always at least one readable metadata file.
+ * On boot, the readable metadata file with the higher version number is used.
+ *
+ * The format of a metadata file is:
+ *
+ * [8 bytes] Format (currently 1).
+ * [8 bytes] Incremental version number.
+ * [8 bytes] Current term.
+ * [8 bytes] ID of server we voted for.
+ *
+ * Closed segments are named by the format string "%lu-%lu" with their
+ * start and end indexes, both inclusive. Closed segments always contain at
+ * least one entry; the end index is always at least as large as the start
+ * index. Closed segment files may occasionally include data past their
+ * filename's end index (these are ignored but a warning is logged). This can
+ * happen if the suffix of the segment is truncated and a crash occurs at an
+ * inopportune time (the segment file is first renamed, then truncated, and a
+ * crash occurs in between).
+ *
+ * Open segments are named by the format string "open-%lu" with a unique
+ * number. These should not exist when the server shuts down cleanly, but they
+ * exist while the server is running and may be left around during a crash.
+ * Open segments either contain entries which come after the last closed
+ * segment or are full of zeros. When the server crashes while appending to an
+ * open segment, the end of that file may be corrupt. We can't distinguish
+ * between a corrupt file and a partially written entry. The code assumes it's
+ * a partially written entry, logs a warning, and ignores it.
+ *
+ * Truncating a suffix of the log will remove all entries that are no longer
+ * part of the log. Truncating a prefix of the log will only remove complete
+ * segments that are before the new log start index. For example, if a
+ * segment has entries 10 through 20 and the prefix of the log is truncated to
+ * start at entry 15, that entire segment will be retained.
+ *
+ * Each segment file starts with a segment header, which currently contains
+ * just an 8-byte version number for the format of that segment. The current
+ * format (version 1) is just a concatenation of serialized entry batches.
+ *
+ * Each batch has the following format:
+ *
+ * [4 bytes] CRC32 checksum of the batch header, little endian.
+ * [4 bytes] CRC32 checksum of the batch data, little endian.
+ * [ ... ] Batch (as described in @raft_decode_entries_batch).
+ *
+ * [0] https://github.com/logcabin/logcabin/blob/master/Storage/SegmentedLog.h
+ */
+RAFT_API int raft_uv_init(struct raft_io *io,
+                          struct uv_loop_s *loop,
+                          const char *dir,
+                          struct raft_uv_transport *transport);
+
+/**
+ * Release any memory allocated internally.
+ */
+RAFT_API void raft_uv_close(struct raft_io *io);
+
+/**
+ * Set the block size that will be used for direct I/O.
+ *
+ * The default is to automatically detect the appropriate block size.
+ */
+RAFT_API void raft_uv_set_block_size(struct raft_io *io, size_t size);
+
+/**
+ * Set the maximum initial size of newly created open segments.
+ *
+ * If the given size is not a multiple of the block size, the actual size will
+ * be reduced to the closest multiple.
+ *
+ * The default is 8 megabytes.
+ */
+RAFT_API void raft_uv_set_segment_size(struct raft_io *io, size_t size);
+
+/**
+ * Turn snapshot compression on or off.
+ * Returns non-0 on failure; this can happen, for example, when compression is
+ * requested but no suitable compression library is found.
+ *
+ * By default snapshots are compressed if the appropriate libraries are found.
+ */
+RAFT_API int raft_uv_set_snapshot_compression(struct raft_io *io,
+                                              bool compressed);
+
+/**
+ * Set how many milliseconds to wait between subsequent retries when
+ * establishing a connection with another server. The default is 1000
+ * milliseconds.
+ */
+RAFT_API void raft_uv_set_connect_retry_delay(struct raft_io *io,
+                                              unsigned msecs);
+
+/**
+ * Emit low-level debug messages using the given tracer.
+ */
+RAFT_API void raft_uv_set_tracer(struct raft_io *io,
+                                 struct raft_tracer *tracer);
+
+/**
+ * Enable or disable auto-recovery on startup. Default enabled.
+ */
+RAFT_API void raft_uv_set_auto_recovery(struct raft_io *io, bool flag);
+
+/**
+ * Callback invoked by the transport implementation when a new incoming
+ * connection has been established.
+ *
+ * No references to @address must be kept after this function returns.
+ *
+ * Ownership of @stream is transferred to user code, which is responsible for
+ * uv_close()'ing it and then releasing its memory.
+ */
+typedef void (*raft_uv_accept_cb)(struct raft_uv_transport *t,
+                                  raft_id id,
+                                  const char *address,
+                                  struct uv_stream_s *stream);
+
+/**
+ * Callback invoked by the transport implementation after a connect request has
+ * completed. If status is #0, then @stream will point to a valid handle, which
+ * user code is then responsible for uv_close()'ing and then releasing.
+ */
+struct raft_uv_connect;
+typedef void (*raft_uv_connect_cb)(struct raft_uv_connect *req,
+                                   struct uv_stream_s *stream,
+                                   int status);
+
+/**
+ * Handle to a connect request.
+ */
+struct raft_uv_connect
+{
+        void *data;            /* User data */
+        raft_uv_connect_cb cb; /* Callback */
+};
+
+/**
+ * Callback invoked by the transport implementation after a close request is
+ * completed.
+ */
+typedef void (*raft_uv_transport_close_cb)(struct raft_uv_transport *t);
+
+/**
+ * Interface to establish outgoing connections to other Raft servers and to
+ * accept incoming connections from them.
+ */
+
+struct raft_uv_transport
+{
+        /**
+         * Keep track of the struct version. MUST be filled out by the user.
+         * When moving to a new version, the user MUST implement the newly
+         * added methods.
+         * Latest version is 1.
+         */
+        int version;
+
+        /**
+         * User defined data.
+         */
+        void *data;
+
+        /**
+         * Implementation-defined state.
+         */
+        void *impl;
+
+        /**
+         * Human-readable message providing diagnostic information about the
+         * last error that occurred.
+         */
+        char errmsg[RAFT_ERRMSG_BUF_SIZE];
+
+        /**
+         * Initialize the transport with the given server's identity.
+         */
+        int (*init)(struct raft_uv_transport *t,
+                    raft_id id,
+                    const char *address);
+
+        /**
+         * Start listening for incoming connections.
+         *
+         * Once a new connection is accepted, the @cb callback passed in the
+         * initializer must be invoked with the relevant details of the
+         * connecting Raft server.
+         */
+        int (*listen)(struct raft_uv_transport *t, raft_uv_accept_cb cb);
+
+        /**
+         * Connect to the server with the given ID and address.
+         *
+         * The @cb callback must be invoked when the connection has been
+         * established or the connection attempt has failed. The memory pointed
+         * by @req can be released only after @cb has fired.
+         */
+        int (*connect)(struct raft_uv_transport *t,
+                       struct raft_uv_connect *req,
+                       raft_id id,
+                       const char *address,
+                       raft_uv_connect_cb cb);
+
+        /**
+         * Close the transport.
+         *
+         * The implementation must:
+         *
+         * - Stop accepting incoming connections. The @cb callback passed to
+         *   @listen must not be invoked anymore.
+         *
+         * - Cancel all pending @connect requests.
+         *
+         * - Invoke the @cb callback passed to this method once it's safe to
+         *   release the memory of the transport object.
+         */
+        void (*close)(struct raft_uv_transport *t,
+                      raft_uv_transport_close_cb cb);
+};
+
+/**
+ * Init a transport interface that uses TCP sockets.
+ */
+RAFT_API int raft_uv_tcp_init(struct raft_uv_transport *t,
+                              struct uv_loop_s *loop);
+
+/**
+ * Release any memory allocated internally.
+ */
+RAFT_API void raft_uv_tcp_close(struct raft_uv_transport *t);
+
+/**
+ * Set the IP address and port that the listening socket will bind to.
+ *
+ * By default the socket will bind to the address provided in
+ * raft_init(), which may be inconvenient if running your application in a
+ * container, for example.
+ *
+ * The @address argument must be an IPv4 dotted quad IP address and port, e.g.
+ * "0.0.0.0:8080". If you do not provide a port, the default of 8080 will be
+ * used. The port given here *must* match the port given to raft_init().
+ *
+ * Must be called before raft_init().
+ */
+RAFT_API int raft_uv_tcp_set_bind_address(struct raft_uv_transport *t,
+                                          const char *address);
+
+/**
+ * Raft cluster test fixture, using an in-memory @raft_io implementation. This
+ * is meant to be used in unit tests.
+ */
+
+#define RAFT_FIXTURE_MAX_SERVERS 8
+
+/**
+ * Fixture step event types.
+ */
+enum {
+        RAFT_FIXTURE_TICK = 1, /* The tick callback has been invoked */
+        RAFT_FIXTURE_NETWORK,  /* A network request has been sent or received */
+        RAFT_FIXTURE_DISK,     /* An I/O request has been submitted */
+        RAFT_FIXTURE_WORK      /* A large, CPU and/or memory intensive task */
+};
+
+/**
+ * State of a single server in a cluster fixture.
+ */
+struct raft_fixture_server;
+
+/**
+ * Information about a test cluster event triggered by the fixture.
+ */
+struct raft_fixture_event;
+
+/**
+ * Returns the type of the event.
+ */
+int raft_fixture_event_type(struct raft_fixture_event *event);
+
+/**
+ * Returns the server index of the event.
+ */
+unsigned raft_fixture_event_server_index(struct raft_fixture_event *event);
+
+/**
+ * Event callback. See raft_fixture_hook().
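+ *
+ * For example (editor's sketch), a test can define a hook that logs every
+ * fixture event:
+ *
+ *     static void trace_hook(struct raft_fixture *f,
+ *                            struct raft_fixture_event *event)
+ *     {
+ *         (void)f;
+ *         printf("event of type %d on server %u\n",
+ *                raft_fixture_event_type(event),
+ *                raft_fixture_event_server_index(event));
+ *     }
+ *
+ * and register it with raft_fixture_hook(f, trace_hook), declared below.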
+ */
+struct raft_fixture;
+typedef void (*raft_fixture_event_cb)(struct raft_fixture *f,
+                                      struct raft_fixture_event *event);
+
+/**
+ * Test implementation of a cluster of @n servers, each having a user-provided
+ * FSM.
+ *
+ * The cluster can simulate network latency and time elapsed on individual
+ * servers.
+ *
+ * Servers can be alive or dead. Network messages sent to dead servers are
+ * dropped. Dead servers do not have their @raft_io_tick_cb callback invoked.
+ *
+ * Any two servers can be connected or disconnected. Network messages sent
+ * between disconnected servers are dropped.
+ */
+struct raft_fixture
+{
+        raft_time time;          /* Global time, common to all servers. */
+        unsigned n;              /* Number of servers. */
+        raft_id leader_id;       /* ID of current leader, or 0 if none. */
+        struct raft_log *log;    /* Copy of current leader's log. */
+        raft_index commit_index; /* Current commit index on leader. */
+        struct raft_fixture_event *event; /* Last event that occurred. */
+        raft_fixture_event_cb hook;       /* Event callback. */
+        struct raft_fixture_server *servers[RAFT_FIXTURE_MAX_SERVERS];
+        uint64_t reserved[16]; /* For future expansion of struct. */
+};
+
+/**
+ * Initialize a raft cluster fixture. Servers can be added by using
+ * `raft_fixture_grow`.
+ */
+RAFT_API int raft_fixture_init(struct raft_fixture *f);
+
+/**
+ * Release all memory used by the fixture.
+ */
+RAFT_API void raft_fixture_close(struct raft_fixture *f);
+
+/**
+ * Convenience to generate a configuration object containing all servers in the
+ * cluster. The first @n_voting servers will be voting ones.
+ */
+RAFT_API int raft_fixture_configuration(struct raft_fixture *f,
+                                        unsigned n_voting,
+                                        struct raft_configuration *conf);
+
+/**
+ * Convenience to bootstrap all servers in the cluster using the given
+ * configuration.
+ */
+RAFT_API int raft_fixture_bootstrap(struct raft_fixture *f,
+                                    struct raft_configuration *conf);
+
+/**
+ * Convenience to start all servers in the fixture.
+ */
+RAFT_API int raft_fixture_start(struct raft_fixture *f);
+
+/**
+ * Return the number of servers in the fixture.
+ */
+RAFT_API unsigned raft_fixture_n(struct raft_fixture *f);
+
+/**
+ * Return the current cluster global time. All raft instances see the same
+ * time.
+ */
+RAFT_API raft_time raft_fixture_time(struct raft_fixture *f);
+
+/**
+ * Return the raft instance associated with the @i'th server of the fixture.
+ */
+RAFT_API struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i);
+
+/**
+ * Return @true if the @i'th server hasn't been killed.
+ */
+RAFT_API bool raft_fixture_alive(struct raft_fixture *f, unsigned i);
+
+/**
+ * Return the index of the current leader, or the current number of servers if
+ * there's no leader.
+ */
+RAFT_API unsigned raft_fixture_leader_index(struct raft_fixture *f);
+
+/**
+ * Return the ID of the server the @i'th server has voted for, or zero if it
+ * hasn't voted.
+ */
+RAFT_API raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i);
+
+/**
+ * Drive the cluster so the @i'th server starts an election but doesn't
+ * necessarily win it.
+ *
+ * This is achieved by bumping the randomized election timeout of all other
+ * servers to a very high value, letting the @i'th server's own timeout expire.
+ *
+ * There must currently be no leader and no candidate and the given server must
+ * be a voting one. Also, the @i'th server must be connected to a majority of
+ * voting servers.
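+ *
+ * A typical call sequence, as an editor's sketch (error handling omitted;
+ * the fsms array is assumed to be provided by the test):
+ *
+ *     struct raft_fixture f;
+ *     struct raft_configuration conf;
+ *     raft_fixture_init(&f);
+ *     raft_fixture_grow(&f, &fsms[0]);
+ *     raft_fixture_grow(&f, &fsms[1]);
+ *     raft_fixture_grow(&f, &fsms[2]);
+ *     raft_fixture_configuration(&f, 3, &conf);
+ *     raft_fixture_bootstrap(&f, &conf);
+ *     raft_fixture_start(&f);
+ *     raft_fixture_start_elect(&f, 0);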
+ */
+RAFT_API void raft_fixture_start_elect(struct raft_fixture *f, unsigned i);
+
+/**
+ * Calls raft_fixture_start_elect, but waits and asserts that the @i'th server
+ * has become the leader.
+ */
+RAFT_API void raft_fixture_elect(struct raft_fixture *f, unsigned i);
+
+/**
+ * Drive the cluster so the current leader gets deposed.
+ *
+ * This is achieved by dropping all AppendEntries result messages sent by
+ * followers to the leader, until the leader decides to step down because it has
+ * lost connectivity to a majority of followers.
+ */
+RAFT_API void raft_fixture_depose(struct raft_fixture *f);
+
+/**
+ * Step through the cluster state advancing the time to the minimum value needed
+ * for it to make progress (i.e. for a message to be delivered, for an I/O
+ * operation to complete or for a single time tick to occur).
+ *
+ * In particular, the following happens:
+ *
+ * 1. If there are pending #raft_io_send requests, that have been submitted
+ *    using #raft_io->send() and not yet sent, the oldest one is picked and the
+ *    relevant callback fired. This simulates completion of a socket write,
+ *    which means that the send request has been completed. The receiver does
+ *    not immediately receive the message, as the message is propagating
+ *    through the network. However, any memory associated with the
+ *    #raft_io_send request can be released (e.g. log entries). The in-memory
+ *    I/O implementation assigns a latency to each RPC message, which will get
+ *    delivered to the receiver only after that amount of time elapses. If the
+ *    sender and the receiver are currently disconnected, the RPC message is
+ *    simply dropped. If a callback was fired, jump directly to 3. and skip 2.
+ *
+ * 2. All pending #raft_io_append disk writes across all servers, that have been
+ *    submitted using #raft_io->append() but not yet completed, are scanned and
+ *    the one with the lowest completion time is picked. All in-flight network
+ *    messages waiting to be delivered are scanned and the one with the lowest
+ *    delivery time is picked. All servers are scanned, and the one with the
+ *    lowest tick expiration time is picked. The three times are compared and
+ *    the lowest one is picked. If a #raft_io_append disk write has completed,
+ *    the relevant callback will be invoked, if there's a network message to be
+ *    delivered, the receiver's @raft_io_recv_cb callback gets fired, if a tick
+ *    timer has expired the relevant #raft_io->tick() callback will be
+ *    invoked. Only one event will be fired. If there is more than one event to
+ *    fire, one of them is picked according to the following rules: events for
+ *    servers with lower index are fired first, tick events take precedence over
+ *    disk events, and disk events take precedence over network events.
+ *
+ * 3. The current cluster leader is detected (if any). When detecting the
+ *    leader, the Election Safety property is checked: no two servers may be in
+ *    leader state in the same term. The server in leader state with the
+ *    highest term is considered the current cluster leader, as long as it's
+ *    "stable", i.e. it has been acknowledged by all servers connected to it,
+ *    and those servers form a majority (this means that no further leader
+ *    change can happen, unless the network gets disrupted). If there is a
+ *    stable leader and it has not changed with respect to the previous call to
+ *    @raft_fixture_step(), then the Leader Append-Only property is checked, by
+ *    comparing its log with a copy of it that was taken during the previous
+ *    iteration.
+ *
+ * 4. If there is a stable leader, its current log is copied, in order to be
+ *    able to check the Leader Append-Only property at the next call.
+ *
+ * 5. If there is a stable leader, its commit index gets copied.
+ *
+ * The function returns information about which particular event occurred
+ * (either in step 1 or 2).
+ */
+RAFT_API struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f);
+
+/**
+ * Call raft_fixture_step() exactly @n times, and return the last event fired.
+ */
+RAFT_API struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f,
+                                                        unsigned n);
+
+/**
+ * Step the cluster until the given @stop function returns #true, or @max_msecs
+ * have elapsed.
+ *
+ * Return #true if the @stop function has returned #true within @max_msecs.
+ */
+RAFT_API bool raft_fixture_step_until(struct raft_fixture *f,
+                                      bool (*stop)(struct raft_fixture *f,
+                                                   void *arg),
+                                      void *arg,
+                                      unsigned max_msecs);
+
+/**
+ * Step the cluster until @msecs have elapsed.
+ */
+RAFT_API void raft_fixture_step_until_elapsed(struct raft_fixture *f,
+                                              unsigned msecs);
+
+/**
+ * Step the cluster until a leader is elected, or @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_has_leader(struct raft_fixture *f,
+                                                 unsigned max_msecs);
+
+/**
+ * Step the cluster until the current leader gets deposed, or @max_msecs have
+ * elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f,
+                                                    unsigned max_msecs);
+
+/**
+ * Step the cluster until the @i'th server has applied the entry at the given
+ * index, or @max_msecs have elapsed. If @i equals the number of servers, then
+ * step until all servers have applied the given entry.
+ */
+RAFT_API bool raft_fixture_step_until_applied(struct raft_fixture *f,
+                                              unsigned i,
+                                              raft_index index,
+                                              unsigned max_msecs);
+
+/**
+ * Step the cluster until the state of the @i'th server matches the given one,
+ * or @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_state_is(struct raft_fixture *f,
+                                               unsigned i,
+                                               int state,
+                                               unsigned max_msecs);
+
+/**
+ * Step the cluster until the term of the @i'th server matches the given one,
+ * or @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_term_is(struct raft_fixture *f,
+                                              unsigned i,
+                                              raft_term term,
+                                              unsigned max_msecs);
+
+/**
+ * Step the cluster until the @i'th server has voted for the @j'th one, or
+ * @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_voted_for(struct raft_fixture *f,
+                                                unsigned i,
+                                                unsigned j,
+                                                unsigned max_msecs);
+
+/**
+ * Step the cluster until all pending network messages from the @i'th server to
+ * the @j'th server have been delivered, or @max_msecs have elapsed.
+ */
+RAFT_API bool raft_fixture_step_until_delivered(struct raft_fixture *f,
+                                                unsigned i,
+                                                unsigned j,
+                                                unsigned max_msecs);
+
+/**
+ * Set a function to be called after every time a fixture event occurs as a
+ * consequence of a step.
+ */
+RAFT_API void raft_fixture_hook(struct raft_fixture *f,
+                                raft_fixture_event_cb hook);
+
+/**
+ * Disconnect the @i'th and the @j'th servers, so attempts to send a message
+ * from @i to @j will fail with #RAFT_NOCONNECTION.
+ */
+RAFT_API void raft_fixture_disconnect(struct raft_fixture *f,
+                                      unsigned i,
+                                      unsigned j);
+
+/**
+ * Reconnect the @i'th and the @j'th servers, so attempts to send a message
+ * from @i to @j will succeed again.
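+ *
+ * For example (editor's sketch), a three-server test can partition the
+ * current leader, assumed here to be server 0, and later heal the cluster:
+ *
+ *     raft_fixture_disconnect(f, 0, 1);
+ *     raft_fixture_disconnect(f, 0, 2);
+ *     raft_fixture_step_until_has_no_leader(f, 10000);
+ *     raft_fixture_reconnect(f, 0, 1);
+ *     raft_fixture_reconnect(f, 0, 2);
+ *     raft_fixture_step_until_has_leader(f, 10000);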
+ */
+RAFT_API void raft_fixture_reconnect(struct raft_fixture *f,
+                                     unsigned i,
+                                     unsigned j);
+
+/**
+ * Saturate the connection between the @i'th and the @j'th servers, so messages
+ * sent by @i to @j will be silently dropped.
+ */
+RAFT_API void raft_fixture_saturate(struct raft_fixture *f,
+                                    unsigned i,
+                                    unsigned j);
+
+/**
+ * Return true if the connection from the @i'th to the @j'th server has been set
+ * as saturated.
+ */
+RAFT_API bool raft_fixture_saturated(struct raft_fixture *f,
+                                     unsigned i,
+                                     unsigned j);
+
+/**
+ * Desaturate the connection between the @i'th and the @j'th servers, so
+ * messages sent by @i to @j will start being delivered again.
+ */
+RAFT_API void raft_fixture_desaturate(struct raft_fixture *f,
+                                      unsigned i,
+                                      unsigned j);
+
+/**
+ * Kill the server with the given index. The server won't receive any message
+ * and its tick callback won't be invoked.
+ */
+RAFT_API void raft_fixture_kill(struct raft_fixture *f, unsigned i);
+
+/**
+ * Revive a killed server with the given index.
+ */
+RAFT_API void raft_fixture_revive(struct raft_fixture *f, unsigned i);
+
+/**
+ * Add a new empty server to the cluster and connect it to all others.
+ */
+RAFT_API int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm);
+
+/**
+ * Set the value that will be returned to the @i'th raft instance when it asks
+ * the underlying #raft_io implementation for a randomized election timeout
+ * value. The default value is 1000 + @i * 100, meaning that the election timer
+ * of server 0 will expire first.
+ */
+RAFT_API void raft_fixture_set_randomized_election_timeout(
+    struct raft_fixture *f,
+    unsigned i,
+    unsigned msecs);
+
+/**
+ * Set the network latency in milliseconds. Each RPC message sent by the @i'th
+ * server from now on will take @msecs milliseconds to be delivered. The default
+ * value is 15.
+ */
+RAFT_API void raft_fixture_set_network_latency(struct raft_fixture *f,
+                                               unsigned i,
+                                               unsigned msecs);
+
+/**
+ * Set the disk I/O latency in milliseconds. Each append request will take this
+ * amount of milliseconds to complete. The default value is 10.
+ */
+RAFT_API void raft_fixture_set_disk_latency(struct raft_fixture *f,
+                                            unsigned i,
+                                            unsigned msecs);
+
+/**
+ * Set the send latency in milliseconds. Each message send will take this many
+ * milliseconds before the send callback is invoked.
+ */
+RAFT_API void raft_fixture_set_send_latency(struct raft_fixture *f,
+                                            unsigned i,
+                                            unsigned j,
+                                            unsigned msecs);
+
+/**
+ * Set the persisted term of the @i'th server.
+ */
+RAFT_API void raft_fixture_set_term(struct raft_fixture *f,
+                                    unsigned i,
+                                    raft_term term);
+
+/**
+ * Set the most recent persisted snapshot on the @i'th server.
+ */
+RAFT_API void raft_fixture_set_snapshot(struct raft_fixture *f,
+                                        unsigned i,
+                                        struct raft_snapshot *snapshot);
+
+/**
+ * Add an entry to the persisted entries of the @i'th server.
+ */
+RAFT_API void raft_fixture_add_entry(struct raft_fixture *f,
+                                     unsigned i,
+                                     struct raft_entry *entry);
+
+/**
+ * Fault injection: make the corresponding I/O operation (entry append, vote
+ * persistence, term persistence or message send) on the @i'th server fail,
+ * after @delay further calls.
+ */
+RAFT_API void raft_fixture_append_fault(struct raft_fixture *f,
+                                        unsigned i,
+                                        int delay);
+
+RAFT_API void raft_fixture_vote_fault(struct raft_fixture *f,
+                                      unsigned i,
+                                      int delay);
+
+RAFT_API void raft_fixture_term_fault(struct raft_fixture *f,
+                                      unsigned i,
+                                      int delay);
+
+RAFT_API void raft_fixture_send_fault(struct raft_fixture *f,
+                                      unsigned i,
+                                      int delay);
+
+/**
+ * Return the number of messages of the given type that the @i'th server has
+ * successfully sent so far.
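+ *
+ * For example (editor's sketch, assuming the RAFT_IO_APPEND_ENTRIES message
+ * type constant defined earlier in this header, and that server 0 is the
+ * current leader), a test can check that the leader has sent at least one
+ * heartbeat or replication message:
+ *
+ *     unsigned n = raft_fixture_n_send(f, 0, RAFT_IO_APPEND_ENTRIES);
+ *     assert(n > 0);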
+ */
+RAFT_API unsigned raft_fixture_n_send(struct raft_fixture *f,
+                                      unsigned i,
+                                      int type);
+
+/**
+ * Return the number of messages of the given type that the @i'th server has
+ * received so far.
+ */
+RAFT_API unsigned raft_fixture_n_recv(struct raft_fixture *f,
+                                      unsigned i,
+                                      int type);
+
+/**
+ * Force the @i'th server into the UNAVAILABLE state.
+ */
+RAFT_API void raft_fixture_make_unavailable(struct raft_fixture *f, unsigned i);
+
+#endif /* RAFT_H */
diff --git a/src/raft/array.h b/src/raft/array.h
new file mode 100644
index 000000000..711135cc6
--- /dev/null
+++ b/src/raft/array.h
@@ -0,0 +1,25 @@
+/* Macros to manipulate contiguous arrays. */
+
+#ifndef ARRAY_H_
+#define ARRAY_H_
+
+#include "../raft.h"
+
+/* Append item I of type T to array A which currently has N items.
+ *
+ * A and N must both be pointers. Set RV to -1 in case of failure. */
+#define ARRAY__APPEND(T, I, A, N, RV)                                 \
+        {                                                             \
+                T *tmp_array;                                         \
+                tmp_array = raft_realloc(*A, (*N + 1) * sizeof **A);  \
+                if (tmp_array != NULL) {                              \
+                        (*N)++;                                       \
+                        *A = tmp_array;                               \
+                        (*A)[(*N) - 1] = I;                           \
+                        RV = 0;                                       \
+                } else {                                              \
+                        RV = -1;                                      \
+                }                                                     \
+        }
+
+#endif /* ARRAY_H_ */
diff --git a/src/raft/assert.h b/src/raft/assert.h
new file mode 100644
index 000000000..3bb77d1ce
--- /dev/null
+++ b/src/raft/assert.h
@@ -0,0 +1,41 @@
+/* Define the assert() macro, either as the standard one or the test one. */
+
+#ifndef ASSERT_H_
+#define ASSERT_H_
+
+#if defined(RAFT_TEST)
+extern void munit_errorf_ex(const char *filename,
+                            int line,
+                            const char *format,
+                            ...);
+#define assert(expr)                                                    \
+        do {                                                            \
+                if (!(expr)) {                                          \
+                        munit_errorf_ex(__FILE__, __LINE__,             \
+                                        "assertion failed: %s", #expr); \
+                }                                                       \
+        } while (0)
+#elif defined(NDEBUG)
+#define assert(x)                        \
+        do {                             \
+                (void)sizeof(x);         \
+        } while (0)
+#elif defined(RAFT_ASSERT_WITH_BACKTRACE)
+#include <assert.h> /* for __assert_fail */
+#include <backtrace.h>
+#include <stdio.h>
+#undef assert
+#define assert(x)                                                             \
+        do {                                                                  \
+                struct backtrace_state *state_;                               \
+                if (!(x)) {                                                   \
+                        state_ = backtrace_create_state(NULL, 0, NULL, NULL); \
+                        backtrace_print(state_, 0, stderr);                   \
+                        __assert_fail(#x, __FILE__, __LINE__, __func__);      \
+                }                                                             \
+        } while (0)
+#else
+#include <assert.h>
+#endif
+
+#endif /* ASSERT_H_ */
diff --git a/src/raft/byte.c b/src/raft/byte.c
new file mode 100644
index 000000000..3fcd79ee8
--- /dev/null
+++ b/src/raft/byte.c
@@ -0,0 +1,374 @@
+#include "byte.h"
+
+/* Taken from https://github.com/gcc-mirror/gcc/blob/master/libiberty/crc32.c */
+static const unsigned byteCrcTable[] = {
+    0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b,
+    0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
+    0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 0x4c11db70, 0x48d0c6c7,
+    0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
+    0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3,
+    0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
+    0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef,
+    0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
+    0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb,
+    0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
+    0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0,
+    0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
+    0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4,
+    0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
+    0x6b93dddb, 0x6f52c06c,
0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, + 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, + 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, + 0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, + 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050, + 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, + 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, + 0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, + 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, + 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, + 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, + 0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, + 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9, + 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b, + 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, + 0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, + 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71, + 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, + 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, + 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, + 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, + 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, + 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, + 0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, + 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676, + 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, + 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, + 0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, + 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4}; + +unsigned byteCrc32(const void *buf, const size_t size, const unsigned init) +{ + unsigned crc = init; + uint8_t *cursor = (uint8_t *)buf; + size_t count = size; + + while (count--) { + crc = (crc << 8) ^ byteCrcTable[((crc >> 24) ^ *cursor) & 255]; + cursor++; + } + return crc; +} + +/* ================ sha1.c ================ */ +/* +SHA-1 in C +By Steve Reid +100% Public Domain + +Test Vectors (from FIPS PUB 180-1) +"abc" + A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D +"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" + 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 +A million repetitions of "a" + 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F +*/ + +/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */ +/* #define SHA1HANDSOFF * Copies data before messing with it. 
+ */
+
+#define SHA1HANDSOFF
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h> /* for u_int*_t */
+#if defined(__sun)
+#include "solarisfixes.h"
+#endif
+
+#ifndef BYTE_ORDER
+#if (BSD >= 199103)
+#include <machine/endian.h>
+#else
+#if defined(linux) || defined(__linux__)
+#include <endian.h>
+#else
+#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */
+#define BIG_ENDIAN 4321    /* most-significant byte first (IBM, net) */
+#define PDP_ENDIAN 3412    /* LSB first in word, MSW first in long (pdp)*/
+
+#if defined(vax) || defined(ns32000) || defined(sun386) ||           \
+    defined(__i386__) || defined(MIPSEL) || defined(_MIPSEL) ||      \
+    defined(BIT_ZERO_ON_RIGHT) || defined(__alpha__) || defined(__alpha)
+#define BYTE_ORDER LITTLE_ENDIAN
+#endif
+
+#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) ||     \
+    defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \
+    defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) ||\
+    defined(apollo) || defined(__convex__) || defined(_CRAY) ||               \
+    defined(__hppa) || defined(__hp9000) || defined(__hp9000s300) ||          \
+    defined(__hp9000s700) || defined(BIT_ZERO_ON_LEFT) || defined(m68k) ||    \
+    defined(__sparc)
+#define BYTE_ORDER BIG_ENDIAN
+#endif
+#endif /* linux */
+#endif /* BSD */
+#endif /* BYTE_ORDER */
+
+#if defined(__BYTE_ORDER) && !defined(BYTE_ORDER)
+#if (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define BYTE_ORDER LITTLE_ENDIAN
+#else
+#define BYTE_ORDER BIG_ENDIAN
+#endif
+#endif
+
+#if !defined(BYTE_ORDER) ||                                       \
+    (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN &&   \
+     BYTE_ORDER != PDP_ENDIAN)
+/* you must determine what the correct bit order is for
+ * your compiler - the next line is an intentional error
+ * which will force your compiles to bomb until you fix
+ * the above macros.
+ */
+#error "Undefined or invalid BYTE_ORDER"
+#endif
+
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define blk0(i)                                              \
+        (block->l[i] = (rol(block->l[i], 24) & 0xFF00FF00) | \
+                       (rol(block->l[i], 8) & 0x00FF00FF))
+#elif BYTE_ORDER == BIG_ENDIAN
+#define blk0(i) block->l[i]
+#else
+#error "Endianness not defined!"
+#endif
+#define blk(i)                                                      \
+        (block->l[i & 15] =                                         \
+             rol(block->l[(i + 13) & 15] ^ block->l[(i + 8) & 15] ^ \
+                     block->l[(i + 2) & 15] ^ block->l[i & 15],     \
+                 1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v, w, x, y, z, i)                                         \
+        z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + rol(v, 5); \
+        w = rol(w, 30);
+#define R1(v, w, x, y, z, i)                                        \
+        z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \
+        w = rol(w, 30);
+#define R2(v, w, x, y, z, i)                                \
+        z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \
+        w = rol(w, 30);
+#define R3(v, w, x, y, z, i)                                              \
+        z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \
+        w = rol(w, 30);
+#define R4(v, w, x, y, z, i)                                \
+        z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \
+        w = rol(w, 30);
+
+static void byteSha1Transform(uint32_t state[5], const uint8_t buffer[64])
+{
+        uint32_t a, b, c, d, e;
+        typedef union {
+                uint8_t c[64];
+                uint32_t l[16];
+        } CHAR64LONG16;
+#ifdef SHA1HANDSOFF
+        CHAR64LONG16 block[1]; /* use array to appear as a pointer */
+        memcpy(block, buffer, 64);
+#else
+        /* The following had better never be used because it causes the
+         * pointer-to-const buffer to be cast into a pointer to non-const.
+         * And the result is written through.
I threw a "const" in, hoping + * this will cause a diagnostic. + */ + CHAR64LONG16 *block = (const CHAR64LONG16 *)buffer; +#endif + /* Copy context->state[] to working vars */ + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + /* 4 rounds of 20 operations each. Loop unrolled. */ + R0(a, b, c, d, e, 0); + R0(e, a, b, c, d, 1); + R0(d, e, a, b, c, 2); + R0(c, d, e, a, b, 3); + R0(b, c, d, e, a, 4); + R0(a, b, c, d, e, 5); + R0(e, a, b, c, d, 6); + R0(d, e, a, b, c, 7); + R0(c, d, e, a, b, 8); + R0(b, c, d, e, a, 9); + R0(a, b, c, d, e, 10); + R0(e, a, b, c, d, 11); + R0(d, e, a, b, c, 12); + R0(c, d, e, a, b, 13); + R0(b, c, d, e, a, 14); + R0(a, b, c, d, e, 15); + R1(e, a, b, c, d, 16); + R1(d, e, a, b, c, 17); + R1(c, d, e, a, b, 18); + R1(b, c, d, e, a, 19); + R2(a, b, c, d, e, 20); + R2(e, a, b, c, d, 21); + R2(d, e, a, b, c, 22); + R2(c, d, e, a, b, 23); + R2(b, c, d, e, a, 24); + R2(a, b, c, d, e, 25); + R2(e, a, b, c, d, 26); + R2(d, e, a, b, c, 27); + R2(c, d, e, a, b, 28); + R2(b, c, d, e, a, 29); + R2(a, b, c, d, e, 30); + R2(e, a, b, c, d, 31); + R2(d, e, a, b, c, 32); + R2(c, d, e, a, b, 33); + R2(b, c, d, e, a, 34); + R2(a, b, c, d, e, 35); + R2(e, a, b, c, d, 36); + R2(d, e, a, b, c, 37); + R2(c, d, e, a, b, 38); + R2(b, c, d, e, a, 39); + R3(a, b, c, d, e, 40); + R3(e, a, b, c, d, 41); + R3(d, e, a, b, c, 42); + R3(c, d, e, a, b, 43); + R3(b, c, d, e, a, 44); + R3(a, b, c, d, e, 45); + R3(e, a, b, c, d, 46); + R3(d, e, a, b, c, 47); + R3(c, d, e, a, b, 48); + R3(b, c, d, e, a, 49); + R3(a, b, c, d, e, 50); + R3(e, a, b, c, d, 51); + R3(d, e, a, b, c, 52); + R3(c, d, e, a, b, 53); + R3(b, c, d, e, a, 54); + R3(a, b, c, d, e, 55); + R3(e, a, b, c, d, 56); + R3(d, e, a, b, c, 57); + R3(c, d, e, a, b, 58); + R3(b, c, d, e, a, 59); + R4(a, b, c, d, e, 60); + R4(e, a, b, c, d, 61); + R4(d, e, a, b, c, 62); + R4(c, d, e, a, b, 63); + R4(b, c, d, e, a, 64); + R4(a, b, c, d, e, 65); + R4(e, a, b, c, d, 66); + R4(d, e, a, b, c, 67); + R4(c, d, e, a, b, 68); + R4(b, c, d, e, a, 69); + R4(a, b, c, d, e, 70); + R4(e, a, b, c, d, 71); + R4(d, e, a, b, c, 72); + R4(c, d, e, a, b, 73); + R4(b, c, d, e, a, 74); + R4(a, b, c, d, e, 75); + R4(e, a, b, c, d, 76); + R4(d, e, a, b, c, 77); + R4(c, d, e, a, b, 78); + R4(b, c, d, e, a, 79); + /* Add the working vars back into context.state[] */ + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + /* Wipe variables */ + a = b = c = d = e = 0; +#ifdef SHA1HANDSOFF + memset(block, '\0', sizeof(block)); +#endif +} + +void byteSha1Init(struct byteSha1 *s) +{ + /* SHA1 initialization constants */ + s->state[0] = 0x67452301; + s->state[1] = 0xEFCDAB89; + s->state[2] = 0x98BADCFE; + s->state[3] = 0x10325476; + s->state[4] = 0xC3D2E1F0; + s->count[0] = s->count[1] = 0; +} + +/* Run your data through this. */ +void __attribute__((noinline)) +byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len) +{ + uint32_t i; + uint32_t j; + + j = s->count[0]; + if ((s->count[0] += len << 3) < j) + s->count[1]++; + s->count[1] += (len >> 29); + j = (j >> 3) & 63; + if ((j + len) > 63) { + memcpy(&s->buffer[j], data, (i = 64 - j)); + byteSha1Transform(s->state, s->buffer); + for (; i + 63 < len; i += 64) { + byteSha1Transform(s->state, &data[i]); + } + j = 0; + } else + i = 0; + memcpy(&s->buffer[j], &data[i], len - i); +} + +/* Add padding and return the message digest. 
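+ *
+ * Illustrative usage of the three-step API (editor's sketch): hashing the
+ * string "abc" should produce the FIPS test vector quoted at the top of this
+ * section (a9993e36...):
+ *
+ *     struct byteSha1 s;
+ *     uint8_t digest[20];
+ *     byteSha1Init(&s);
+ *     byteSha1Update(&s, (const uint8_t *)"abc", 3);
+ *     byteSha1Digest(&s, digest);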
*/
+
+void byteSha1Digest(struct byteSha1 *s, uint8_t value[20])
+{
+        unsigned i;
+        uint8_t finalcount[8];
+        uint8_t c;
+
+#if 0 /* untested "improvement" by DHR */
+        /* Convert s->count to a sequence of bytes
+         * in finalcount. Second element first, but
+         * big-endian order within element.
+         * But we do it all backwards.
+         */
+        uint8_t *fcp = &finalcount[8];
+
+        for (i = 0; i < 2; i++)
+        {
+                uint32_t t = s->count[i];
+                int j;
+
+                for (j = 0; j < 4; t >>= 8, j++)
+                        *--fcp = (uint8_t)t;
+        }
+#else
+        for (i = 0; i < 8; i++) {
+                finalcount[i] = (uint8_t)((s->count[(i >= 4 ? 0 : 1)] >>
+                                           ((3 - (i & 3)) * 8)) &
+                                          255); /* Endian independent */
+        }
+#endif
+        c = 0200;
+        byteSha1Update(s, &c, 1);
+        while ((s->count[0] & 504) != 448) {
+                c = 0000;
+                byteSha1Update(s, &c, 1);
+        }
+        byteSha1Update(s, finalcount, 8); /* Should cause a SHA1Transform() */
+        for (i = 0; i < 20; i++) {
+                value[i] =
+                    (uint8_t)((s->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
+        }
+        /* Wipe variables */
+        memset(s, '\0', sizeof(*s));
+        memset(&finalcount, '\0', sizeof(finalcount));
+}
+
+/* ================ end of sha1.c ================ */
diff --git a/src/raft/byte.h b/src/raft/byte.h
new file mode 100644
index 000000000..ba213e914
--- /dev/null
+++ b/src/raft/byte.h
@@ -0,0 +1,182 @@
+/* Byte-level utilities. */
+
+#ifndef BYTE_H_
+#define BYTE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__cplusplus)
+#define BYTE__INLINE inline
+#else
+#if defined(__clang__)
+#define BYTE__INLINE static inline __attribute__((unused))
+#else
+#define BYTE__INLINE static inline
+#endif
+#endif
+
+/* Compile-time endianness detection (best effort). */
+#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
+    (defined(__ARMEL__) && (__ARMEL__ == 1))
+#define BYTE__LITTLE_ENDIAN
+#elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) && \
+    defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8
+#define RAFT__BIG_ENDIAN
+#endif
+
+/* Flip a 32-bit number to network byte order (little endian) */
+BYTE__INLINE uint32_t byteFlip32(uint32_t v)
+{
+#if defined(BYTE__LITTLE_ENDIAN)
+        return v;
+#elif defined(RAFT__BIG_ENDIAN)
+        return __builtin_bswap32(v);
+#else /* Unknown endianness */
+        union {
+                uint32_t u;
+                uint8_t v[4];
+        } s;
+
+        s.v[0] = (uint8_t)v;
+        s.v[1] = (uint8_t)(v >> 8);
+        s.v[2] = (uint8_t)(v >> 16);
+        s.v[3] = (uint8_t)(v >> 24);
+
+        return s.u;
+#endif
+}
+
+/* Flip a 64-bit number to network byte order (little endian) */
+BYTE__INLINE uint64_t byteFlip64(uint64_t v)
+{
+#if defined(BYTE__LITTLE_ENDIAN)
+        return v;
+#elif defined(RAFT__BIG_ENDIAN)
+        return __builtin_bswap64(v);
+#else
+        union {
+                uint64_t u;
+                uint8_t v[8];
+        } s;
+
+        s.v[0] = (uint8_t)v;
+        s.v[1] = (uint8_t)(v >> 8);
+        s.v[2] = (uint8_t)(v >> 16);
+        s.v[3] = (uint8_t)(v >> 24);
+        s.v[4] = (uint8_t)(v >> 32);
+        s.v[5] = (uint8_t)(v >> 40);
+        s.v[6] = (uint8_t)(v >> 48);
+        s.v[7] = (uint8_t)(v >> 56);
+
+        return s.u;
+#endif
+}
+
+BYTE__INLINE void bytePut8(void **cursor, uint8_t value)
+{
+        uint8_t **p = (uint8_t **)cursor;
+        **p = value;
+        *p += 1;
+}
+
+BYTE__INLINE void bytePut32(void **cursor, uint32_t value)
+{
+        unsigned i;
+        uint32_t flipped = byteFlip32(value);
+        for (i = 0; i < sizeof(uint32_t); i++) {
+                bytePut8(cursor, ((uint8_t *)(&flipped))[i]);
+        }
+}
+
+BYTE__INLINE void bytePut64(void **cursor, uint64_t value)
+{
+        unsigned i;
+        uint64_t flipped = byteFlip64(value);
+        for (i = 0; i < sizeof(uint64_t); i++) {
+                bytePut8(cursor, ((uint8_t *)(&flipped))[i]);
+        }
+}
+
+BYTE__INLINE void bytePutString(void
**cursor, const char *value) +{ + char **p = (char **)cursor; + strcpy(*p, value); + *p += strlen(value) + 1; +} + +BYTE__INLINE uint8_t byteGet8(const void **cursor) +{ + const uint8_t **p = (const uint8_t **)cursor; + uint8_t value = **p; + *p += 1; + return value; +} + +BYTE__INLINE uint32_t byteGet32(const void **cursor) +{ + uint32_t value = 0; + unsigned i; + for (i = 0; i < sizeof(uint32_t); i++) { + ((uint8_t *)(&value))[i] = byteGet8(cursor); + } + return byteFlip32(value); +} + +BYTE__INLINE uint64_t byteGet64(const void **cursor) +{ + uint64_t value = 0; + unsigned i; + for (i = 0; i < sizeof(uint64_t); i++) { + ((uint8_t *)(&value))[i] = byteGet8(cursor); + } + return byteFlip64(value); +} + +BYTE__INLINE const char *byteGetString(const void **cursor, size_t max_len) +{ + const char **p = (const char **)cursor; + const char *value = *p; + size_t len = 0; + while (len < max_len) { + if (*(*p + len) == 0) { + break; + } + len++; + } + if (len == max_len) { + return NULL; + } + *p += len + 1; + return value; +} + +/* Add padding to size if it's not a multiple of 8. */ +BYTE__INLINE size_t bytePad64(size_t size) +{ + size_t rest = size % sizeof(uint64_t); + + if (rest != 0) { + size += sizeof(uint64_t) - rest; + } + + return size; +} + +/* Calculate the CRC32 checksum of the given data buffer. */ +unsigned byteCrc32(const void *buf, size_t size, unsigned init); + +struct byteSha1 +{ + uint32_t state[5]; + uint32_t count[2]; + uint8_t buffer[64]; + uint8_t value[20]; +}; + +void byteSha1Init(struct byteSha1 *s); +void byteSha1Update(struct byteSha1 *s, const uint8_t *data, uint32_t len); +void byteSha1Digest(struct byteSha1 *s, uint8_t value[20]); + +#endif /* BYTE_H_ */ diff --git a/src/raft/callbacks.c b/src/raft/callbacks.c new file mode 100644 index 000000000..5f58ee21a --- /dev/null +++ b/src/raft/callbacks.c @@ -0,0 +1,24 @@ +#include "callbacks.h" +#include "heap.h" + +int raftInitCallbacks(struct raft *r) +{ + r->callbacks = 0; + struct raft_callbacks *cbs = RaftHeapCalloc(1, sizeof(*cbs)); + if (cbs == NULL) { + return RAFT_NOMEM; + } + r->callbacks = (uint64_t)(uintptr_t)cbs; + return 0; +} + +void raftDestroyCallbacks(struct raft *r) +{ + RaftHeapFree((void *)(uintptr_t)r->callbacks); + r->callbacks = 0; +} + +struct raft_callbacks *raftGetCallbacks(struct raft *r) +{ + return (void *)(uintptr_t)r->callbacks; +} diff --git a/src/raft/callbacks.h b/src/raft/callbacks.h new file mode 100644 index 000000000..e756b3070 --- /dev/null +++ b/src/raft/callbacks.h @@ -0,0 +1,15 @@ +#ifndef CALLBACKS_H_ +#define CALLBACKS_H_ + +#include "../raft.h" + +struct raft_callbacks +{ + raft_state_cb state_cb; +}; + +int raftInitCallbacks(struct raft *r); +void raftDestroyCallbacks(struct raft *r); +struct raft_callbacks *raftGetCallbacks(struct raft *r); + +#endif diff --git a/src/raft/client.c b/src/raft/client.c new file mode 100644 index 000000000..cd88f1d2b --- /dev/null +++ b/src/raft/client.c @@ -0,0 +1,455 @@ +#include "../raft.h" +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "err.h" +#include "lifecycle.h" +#include "log.h" +#include "membership.h" +#include "progress.h" +#include "queue.h" +#include "replication.h" +#include "request.h" + +int raft_apply(struct raft *r, + struct raft_apply *req, + const struct raft_buffer bufs[], + const unsigned n, + raft_apply_cb cb) +{ + raft_index index; + int rv; + + tracef("raft_apply n %d", n); + + assert(r != NULL); + assert(bufs != NULL); + assert(n > 0); + + if (r->state != RAFT_LEADER || r->transfer 
!= NULL) { + rv = RAFT_NOTLEADER; + ErrMsgFromCode(r->errmsg, rv); + tracef("raft_apply not leader"); + goto err; + } + + /* Index of the first entry being appended. */ + index = logLastIndex(r->log) + 1; + tracef("%u commands starting at %lld", n, index); + req->type = RAFT_COMMAND; + req->index = index; + req->cb = cb; + + /* Append the new entries to the log. */ + rv = logAppendCommands(r->log, r->current_term, bufs, n); + if (rv != 0) { + goto err; + } + + lifecycleRequestStart(r, (struct request *)req); + + rv = replicationTrigger(r, index); + if (rv != 0) { + goto err_after_log_append; + } + + return 0; + +err_after_log_append: + logDiscard(r->log, index); + QUEUE_REMOVE(&req->queue); +err: + assert(rv != 0); + return rv; +} + +int raft_barrier(struct raft *r, struct raft_barrier *req, raft_barrier_cb cb) +{ + raft_index index; + struct raft_buffer buf; + int rv; + + if (r->state != RAFT_LEADER || r->transfer != NULL) { + rv = RAFT_NOTLEADER; + goto err; + } + + /* TODO: use a completely empty buffer */ + buf.len = 8; + buf.base = raft_malloc(buf.len); + + if (buf.base == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + /* Index of the barrier entry being appended. */ + index = logLastIndex(r->log) + 1; + tracef("barrier starting at %lld", index); + req->type = RAFT_BARRIER; + req->index = index; + req->cb = cb; + + rv = logAppend(r->log, r->current_term, RAFT_BARRIER, &buf, NULL); + if (rv != 0) { + goto err_after_buf_alloc; + } + + lifecycleRequestStart(r, (struct request *)req); + + rv = replicationTrigger(r, index); + if (rv != 0) { + goto err_after_log_append; + } + + return 0; + +err_after_log_append: + logDiscard(r->log, index); + QUEUE_REMOVE(&req->queue); +err_after_buf_alloc: + raft_free(buf.base); +err: + return rv; +} + +static int clientChangeConfiguration( + struct raft *r, + struct raft_change *req, + const struct raft_configuration *configuration) +{ + raft_index index; + raft_term term = r->current_term; + int rv; + + (void)req; + + /* Index of the entry being appended. */ + index = logLastIndex(r->log) + 1; + + /* Encode the new configuration and append it to the log. */ + rv = logAppendConfiguration(r->log, term, configuration); + if (rv != 0) { + goto err; + } + + if (configuration->n != r->configuration.n) { + rv = progressRebuildArray(r, configuration); + if (rv != 0) { + goto err; + } + } + + /* Update the current configuration if we've created a new object. */ + if (configuration != &r->configuration) { + raft_configuration_close(&r->configuration); + r->configuration = *configuration; + } + + /* Start writing the new log entry to disk and send it to the followers. + */ + rv = replicationTrigger(r, index); + if (rv != 0) { + /* TODO: restore the old next/match indexes and configuration. + */ + goto err_after_log_append; + } + + r->configuration_uncommitted_index = index; + + return 0; + +err_after_log_append: + logTruncate(r->log, index); + +err: + assert(rv != 0); + return rv; +} + +int raft_add(struct raft *r, + struct raft_change *req, + raft_id id, + const char *address, + raft_change_cb cb) +{ + struct raft_configuration configuration; + int rv; + + rv = membershipCanChangeConfiguration(r); + if (rv != 0) { + return rv; + } + + tracef("add server: id %llu, address %s", id, address); + + /* Make a copy of the current configuration, and add the new server to + * it. 
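+ * The new server is added with the RAFT_SPARE role; it can later be
+ * promoted to stand-by or voter via raft_assign() once it has caught up.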
*/ + rv = configurationCopy(&r->configuration, &configuration); + if (rv != 0) { + goto err; + } + + rv = raft_configuration_add(&configuration, id, address, RAFT_SPARE); + if (rv != 0) { + goto err_after_configuration_copy; + } + + req->cb = cb; + + rv = clientChangeConfiguration(r, req, &configuration); + if (rv != 0) { + goto err_after_configuration_copy; + } + + assert(r->leader_state.change == NULL); + r->leader_state.change = req; + + return 0; + +err_after_configuration_copy: + raft_configuration_close(&configuration); +err: + assert(rv != 0); + return rv; +} + +int raft_assign(struct raft *r, + struct raft_change *req, + raft_id id, + int role, + raft_change_cb cb) +{ + const struct raft_server *server; + unsigned server_index; + raft_index last_index; + int rv; + + tracef("raft_assign to id:%llu the role:%d", id, role); + if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) { + rv = RAFT_BADROLE; + ErrMsgFromCode(r->errmsg, rv); + return rv; + } + + rv = membershipCanChangeConfiguration(r); + if (rv != 0) { + return rv; + } + + server = configurationGet(&r->configuration, id); + if (server == NULL) { + rv = RAFT_NOTFOUND; + ErrMsgPrintf(r->errmsg, "no server has ID %llu", id); + goto err; + } + + /* Check if we have already the desired role. */ + if (server->role == role) { + const char *name; + rv = RAFT_BADROLE; + switch (role) { + case RAFT_VOTER: + name = "voter"; + break; + case RAFT_STANDBY: + name = "stand-by"; + break; + case RAFT_SPARE: + name = "spare"; + break; + default: + name = NULL; + assert(0); + break; + } + ErrMsgPrintf(r->errmsg, "server is already %s", name); + goto err; + } + + server_index = configurationIndexOf(&r->configuration, id); + assert(server_index < r->configuration.n); + + last_index = logLastIndex(r->log); + + req->cb = cb; + + assert(r->leader_state.change == NULL); + r->leader_state.change = req; + + /* If we are not promoting to the voter role or if the log of this + * server is already up-to-date, we can submit the configuration change + * immediately. */ + if (role != RAFT_VOTER || + progressMatchIndex(r, server_index) == last_index) { + int old_role = r->configuration.servers[server_index].role; + r->configuration.servers[server_index].role = role; + + rv = clientChangeConfiguration(r, req, &r->configuration); + if (rv != 0) { + tracef("clientChangeConfiguration failed %d", rv); + r->configuration.servers[server_index].role = old_role; + return rv; + } + + return 0; + } + + r->leader_state.promotee_id = server->id; + + /* Initialize the first catch-up round. */ + r->leader_state.round_number = 1; + r->leader_state.round_index = last_index; + r->leader_state.round_start = r->io->time(r->io); + + /* Immediately initiate an AppendEntries request. */ + rv = replicationProgress(r, server_index); + if (rv != 0 && rv != RAFT_NOCONNECTION) { + /* This error is not fatal. 
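+ * The promotion stays pending (promotee_id was set above) and the
+ * AppendEntries to this server should be retried by the normal
+ * heartbeat logic on subsequent ticks.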
*/ + tracef("failed to send append entries to server %llu: %s (%d)", + server->id, raft_strerror(rv), rv); + } + + return 0; + +err: + assert(rv != 0); + return rv; +} + +int raft_remove(struct raft *r, + struct raft_change *req, + raft_id id, + raft_change_cb cb) +{ + const struct raft_server *server; + struct raft_configuration configuration; + int rv; + + rv = membershipCanChangeConfiguration(r); + if (rv != 0) { + return rv; + } + + server = configurationGet(&r->configuration, id); + if (server == NULL) { + rv = RAFT_BADID; + goto err; + } + + tracef("remove server: id %llu", id); + + /* Make a copy of the current configuration, and remove the given server + * from it. */ + rv = configurationCopy(&r->configuration, &configuration); + if (rv != 0) { + goto err; + } + + rv = configurationRemove(&configuration, id); + if (rv != 0) { + goto err_after_configuration_copy; + } + + req->cb = cb; + + rv = clientChangeConfiguration(r, req, &configuration); + if (rv != 0) { + goto err_after_configuration_copy; + } + + assert(r->leader_state.change == NULL); + r->leader_state.change = req; + + return 0; + +err_after_configuration_copy: + raft_configuration_close(&configuration); + +err: + assert(rv != 0); + return rv; +} + +/* Find a suitable voting follower. */ +static raft_id clientSelectTransferee(struct raft *r) +{ + const struct raft_server *transferee = NULL; + unsigned i; + + for (i = 0; i < r->configuration.n; i++) { + const struct raft_server *server = &r->configuration.servers[i]; + if (server->id == r->id || server->role != RAFT_VOTER) { + continue; + } + transferee = server; + if (progressIsUpToDate(r, i)) { + break; + } + } + + if (transferee != NULL) { + return transferee->id; + } + + return 0; +} + +int raft_transfer(struct raft *r, + struct raft_transfer *req, + raft_id id, + raft_transfer_cb cb) +{ + const struct raft_server *server; + unsigned i; + int rv; + + tracef("transfer to %llu", id); + if (r->state != RAFT_LEADER || r->transfer != NULL) { + tracef("transfer error - state:%d", r->state); + rv = RAFT_NOTLEADER; + ErrMsgFromCode(r->errmsg, rv); + goto err; + } + + if (id == 0) { + id = clientSelectTransferee(r); + if (id == 0) { + rv = RAFT_NOTFOUND; + ErrMsgPrintf(r->errmsg, + "there's no other voting server"); + goto err; + } + } + + server = configurationGet(&r->configuration, id); + if (server == NULL || server->id == r->id || + server->role != RAFT_VOTER) { + rv = RAFT_BADID; + ErrMsgFromCode(r->errmsg, rv); + goto err; + } + + /* If this follower is up-to-date, we can send it the TimeoutNow message + * right away. */ + i = configurationIndexOf(&r->configuration, server->id); + assert(i < r->configuration.n); + + membershipLeadershipTransferInit(r, req, id, cb); + + if (progressPersistedIsUpToDate(r, i)) { + rv = membershipLeadershipTransferStart(r); + if (rv != 0) { + r->transfer = NULL; + goto err; + } + } + + return 0; + +err: + assert(rv != 0); + return rv; +} + +#undef tracef diff --git a/src/raft/compress.c b/src/raft/compress.c new file mode 100644 index 000000000..5297f4cd9 --- /dev/null +++ b/src/raft/compress.c @@ -0,0 +1,277 @@ +#include "compress.h" + +#ifdef LZ4_AVAILABLE +#include +#endif +#include +#include + +#include "assert.h" +#include "byte.h" +#include "err.h" + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? 
(a) : (b))
+#define MEGABYTE 1048576
+
+int Compress(struct raft_buffer bufs[],
+             unsigned n_bufs,
+             struct raft_buffer *compressed,
+             char *errmsg)
+{
+#ifndef LZ4_AVAILABLE
+        (void)bufs;
+        (void)n_bufs;
+        (void)compressed;
+        ErrMsgPrintf(errmsg, "LZ4 not available");
+        return RAFT_INVALID;
+#else
+        assert(bufs != NULL);
+        assert(n_bufs > 0);
+        assert(compressed != NULL);
+        assert(errmsg != NULL);
+
+        int rv = RAFT_IOERR;
+        size_t src_size = 0;
+        size_t dst_size = 0;
+        size_t src_offset = 0;
+        size_t dst_offset = 0;
+        size_t dst_size_needed = 0; /* Store minimal dst_size */
+        size_t ret = 0;             /* Return value of LZ4F_XXX functions */
+        compressed->base = NULL;
+        compressed->len = 0;
+
+        /* Determine total uncompressed size */
+        for (unsigned i = 0; i < n_bufs; ++i) {
+                src_size += bufs[i].len;
+        }
+
+        /* Work around a bug in liblz4 on bionic; in practice raft should only
+         * Compress non-0 length buffers, so this should be fine.
+         * https://github.com/lz4/lz4/issues/157
+         */
+        if (src_size == 0) {
+                ErrMsgPrintf(errmsg, "total size must be larger than 0");
+                rv = RAFT_INVALID;
+                goto err;
+        }
+
+        /* Set LZ4 preferences */
+        LZ4F_preferences_t lz4_pref;
+        memset(&lz4_pref, 0, sizeof(lz4_pref));
+        /* Detect data corruption when decompressing */
+        lz4_pref.frameInfo.contentChecksumFlag = 1;
+        /* For allocating a suitable buffer when decompressing */
+        lz4_pref.frameInfo.contentSize = src_size;
+
+        /* Context to track compression progress */
+        LZ4F_compressionContext_t ctx;
+        ret = LZ4F_createCompressionContext(&ctx, LZ4F_VERSION);
+        if (LZ4F_isError(ret)) {
+                ErrMsgPrintf(errmsg, "LZ4F_createCompressionContext %s",
+                             LZ4F_getErrorName(ret));
+                rv = RAFT_NOMEM;
+                goto err;
+        }
+
+        /* Guesstimate of the eventual compressed size, mainly to avoid
+         * allocating a huge buffer, as `LZ4F_compressBound` calculates the
+         * worst-case scenario.
+         */
+        dst_size = LZ4F_compressBound(
+            max(MEGABYTE, (size_t)lz4_pref.frameInfo.contentSize / 10),
+            &lz4_pref);
+        dst_size += LZ4F_HEADER_SIZE_MAX_RAFT;
+        compressed->base = raft_malloc(dst_size);
+        if (compressed->base == NULL) {
+                rv = RAFT_NOMEM;
+                goto err_after_ctx_alloc;
+        }
+
+        /* Returns the size of the lz4 header, data should be written after the
+         * header */
+        dst_offset =
+            LZ4F_compressBegin(ctx, compressed->base, dst_size, &lz4_pref);
+        if (LZ4F_isError(dst_offset)) {
+                ErrMsgPrintf(errmsg, "LZ4F_compressBegin %s",
+                             LZ4F_getErrorName(dst_offset));
+                rv = RAFT_IOERR;
+                goto err_after_buff_alloc;
+        }
+
+        /* Compress all buffers */
+        for (unsigned i = 0; i < n_bufs; ++i) {
+                src_offset = 0;
+                while (src_offset < bufs[i].len) {
+                        /* Compress in chunks of maximum 1MB and check if there
+                         * is enough room in the dst buffer, if not realloc */
+                        src_size =
+                            min(bufs[i].len - src_offset, (size_t)MEGABYTE);
+                        dst_size_needed =
+                            LZ4F_compressBound(src_size, &lz4_pref);
+                        if (dst_size - dst_offset < dst_size_needed) {
+                                dst_size +=
+                                    max(dst_size_needed,
+                                        (size_t)lz4_pref.frameInfo.contentSize /
+                                            10);
+                                /* Use a temporary pointer so the original
+                                 * buffer can still be freed if realloc
+                                 * fails. */
+                                void *base =
+                                    raft_realloc(compressed->base, dst_size);
+                                if (base == NULL) {
+                                        rv = RAFT_NOMEM;
+                                        goto err_after_buff_alloc;
+                                }
+                                compressed->base = base;
+                        }
+                        /* There is guaranteed enough room in `dst` to perform
+                         * the compression */
+                        ret = LZ4F_compressUpdate(
+                            ctx, (char *)compressed->base + dst_offset,
+                            dst_size - dst_offset,
+                            (char *)bufs[i].base + src_offset, src_size, NULL);
+                        if (LZ4F_isError(ret)) {
+                                ErrMsgPrintf(errmsg, "LZ4F_compressUpdate %s",
+                                             LZ4F_getErrorName(ret));
+                                rv = RAFT_IOERR;
+                                goto err_after_buff_alloc;
+                        }
+                        dst_offset += ret;
+                        src_offset += src_size;
+                }
+        }
+
+        /* Make sure LZ4F_compressEnd has enough room to succeed */
+        dst_size_needed = LZ4F_compressBound(0, &lz4_pref);
+        if ((dst_size - dst_offset) < dst_size_needed) {
+                dst_size += dst_size_needed;
+                /* Same temporary-pointer pattern as above to avoid leaking
+                 * the buffer on a failed realloc. */
+                void *base = raft_realloc(compressed->base, dst_size);
+                if (base == NULL) {
+                        rv = RAFT_NOMEM;
+                        goto err_after_buff_alloc;
+                }
+                compressed->base = base;
+        }
+
+        /* Finalize compression */
+        ret = LZ4F_compressEnd(ctx, (char *)compressed->base + dst_offset,
+                               dst_size - dst_offset, NULL);
+        if (LZ4F_isError(ret)) {
+                ErrMsgPrintf(errmsg, "LZ4F_compressEnd %s",
+                             LZ4F_getErrorName(ret));
+                rv = RAFT_IOERR;
+                goto err_after_buff_alloc;
+        }
+
+        dst_offset += ret;
+        compressed->len = dst_offset;
+
+        LZ4F_freeCompressionContext(ctx);
+        return 0;
+
+err_after_buff_alloc:
+        raft_free(compressed->base);
+        compressed->base = NULL;
+err_after_ctx_alloc:
+        LZ4F_freeCompressionContext(ctx);
+err:
+        return rv;
+#endif /* LZ4_AVAILABLE */
+}
+
+int Decompress(struct raft_buffer buf,
+               struct raft_buffer *decompressed,
+               char *errmsg)
+{
+#ifndef LZ4_AVAILABLE
+        (void)buf;
+        (void)decompressed;
+        ErrMsgPrintf(errmsg, "LZ4 not available");
+        return RAFT_INVALID;
+#else
+        assert(decompressed != NULL);
+
+        int rv = RAFT_IOERR;
+        size_t src_offset = 0;
+        size_t dst_offset = 0;
+        size_t src_size = 0;
+        size_t dst_size = 0;
+        size_t ret = 0;
+
+        LZ4F_decompressionContext_t ctx;
+        if (LZ4F_isError(LZ4F_createDecompressionContext(&ctx, LZ4F_VERSION))) {
+                ErrMsgPrintf(errmsg, "LZ4F_createDecompressionContext");
+                rv = RAFT_NOMEM;
+                goto err;
+        }
+
+        src_size = buf.len;
+        LZ4F_frameInfo_t frameInfo = {0};
+        /* `src_size` will contain the size of the LZ4 Frame Header after the
+         * call, decompression must resume at that offset.
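+ * The header also carries contentSize, which Compress() stored in the
+ * frame, allowing the output buffer to be allocated up front below.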
*/ + ret = LZ4F_getFrameInfo(ctx, &frameInfo, buf.base, &src_size); + if (LZ4F_isError(ret)) { + ErrMsgPrintf(errmsg, "LZ4F_getFrameInfo %s", + LZ4F_getErrorName(ret)); + rv = RAFT_IOERR; + goto err_after_ctx_alloc; + } + src_offset = src_size; + + decompressed->base = raft_malloc((size_t)frameInfo.contentSize); + decompressed->len = (size_t)frameInfo.contentSize; + if (decompressed->base == NULL) { + rv = RAFT_NOMEM; + goto err_after_ctx_alloc; + } + + ret = 1; + while (ret != 0) { + src_size = buf.len - src_offset; + /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * The next line works around a bug in an older lz4 lib where + * the `size_t` dst_size parameter would overflow an `int`. + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ + dst_size = min(decompressed->len - dst_offset, (size_t)INT_MAX); + /* `dst_size` will contain the number of bytes written to + * decompressed->base, while `src_size` will contain the number + * of bytes consumed from buf.base */ + ret = LZ4F_decompress( + ctx, (char *)decompressed->base + dst_offset, &dst_size, + (char *)buf.base + src_offset, &src_size, NULL); + if (LZ4F_isError(ret)) { + ErrMsgPrintf(errmsg, "LZ4F_decompress %s", + LZ4F_getErrorName(ret)); + rv = RAFT_IOERR; + goto err_after_buff_alloc; + } + src_offset += src_size; + dst_offset += dst_size; + } + + if (LZ4F_freeDecompressionContext(ctx) != 0) { + raft_free(decompressed->base); + decompressed->base = NULL; + return RAFT_IOERR; + } + + return 0; + +err_after_buff_alloc: + raft_free(decompressed->base); + decompressed->base = NULL; +err_after_ctx_alloc: + LZ4F_freeDecompressionContext(ctx); +err: + return rv; +#endif /* LZ4_AVAILABLE */ +} + +bool IsCompressed(const void *data, size_t sz) +{ + if (data == NULL || sz < 4) { + return false; + } + const void *cursor = data; +#ifdef LZ4F_MAGICNUMBER +#define RAFT_LZ4F_MAGICNUMBER LZ4F_MAGICNUMBER +#else +#define RAFT_LZ4F_MAGICNUMBER 0x184D2204U +#endif + return byteGet32(&cursor) == RAFT_LZ4F_MAGICNUMBER; +} diff --git a/src/raft/compress.h b/src/raft/compress.h new file mode 100644 index 000000000..b36379fcc --- /dev/null +++ b/src/raft/compress.h @@ -0,0 +1,34 @@ +#ifndef COMPRESS_H_ +#define COMPRESS_H_ + +#include "../raft.h" + +#ifdef LZ4F_HEADER_SIZE_MAX +#define LZ4F_HEADER_SIZE_MAX_RAFT LZ4F_HEADER_SIZE_MAX +#else +#define LZ4F_HEADER_SIZE_MAX_RAFT 19UL +#endif + +/* + * Compresses the content of `bufs` into a newly allocated buffer that is + * returned to the caller through `compressed`. Returns a non-0 value upon + * failure. + */ +int Compress(struct raft_buffer bufs[], + unsigned n_bufs, + struct raft_buffer *compressed, + char *errmsg); + +/* + * Decompresses the content of `buf` into a newly allocated buffer that is + * returned to the caller through `decompressed`. Returns a non-0 value upon + * failure. + */ +int Decompress(struct raft_buffer buf, + struct raft_buffer *decompressed, + char *errmsg); + +/* Returns `true` if `data` is compressed, `false` otherwise. */ +bool IsCompressed(const void *data, size_t sz); + +#endif /* COMPRESS_H_ */ diff --git a/src/raft/configuration.c b/src/raft/configuration.c new file mode 100644 index 000000000..04ca13764 --- /dev/null +++ b/src/raft/configuration.c @@ -0,0 +1,401 @@ +#include "configuration.h" + +#include "../tracing.h" +#include "assert.h" +#include "byte.h" + +/* Current encoding format version. 
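+ * Format 1 layout (see configurationEncodeToBuf() below): one version
+ * byte, a 64-bit server count, then for each server a 64-bit ID, a
+ * NUL-terminated address string and a one-byte role code, with the total
+ * padded to a multiple of 8 bytes.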
*/ +#define ENCODING_FORMAT 1 + +void configurationInit(struct raft_configuration *c) +{ + c->servers = NULL; + c->n = 0; +} + +void configurationClose(struct raft_configuration *c) +{ + size_t i; + assert(c != NULL); + assert(c->n == 0 || c->servers != NULL); + for (i = 0; i < c->n; i++) { + raft_free(c->servers[i].address); + } + if (c->servers != NULL) { + raft_free(c->servers); + } +} + +unsigned configurationIndexOf(const struct raft_configuration *c, + const raft_id id) +{ + unsigned i; + assert(c != NULL); + for (i = 0; i < c->n; i++) { + if (c->servers[i].id == id) { + return i; + } + } + return c->n; +} + +unsigned configurationIndexOfVoter(const struct raft_configuration *c, + const raft_id id) +{ + unsigned i; + unsigned j = 0; + assert(c != NULL); + assert(id > 0); + + for (i = 0; i < c->n; i++) { + if (c->servers[i].id == id) { + if (c->servers[i].role == RAFT_VOTER) { + return j; + } + return c->n; + } + if (c->servers[i].role == RAFT_VOTER) { + j++; + } + } + + return c->n; +} + +const struct raft_server *configurationGet(const struct raft_configuration *c, + const raft_id id) +{ + size_t i; + assert(c != NULL); + assert(id > 0); + + /* Grab the index of the server with the given ID */ + i = configurationIndexOf(c, id); + + if (i == c->n) { + /* No server with matching ID. */ + return NULL; + } + assert(i < c->n); + + return &c->servers[i]; +} + +unsigned configurationVoterCount(const struct raft_configuration *c) +{ + unsigned i; + unsigned n = 0; + assert(c != NULL); + for (i = 0; i < c->n; i++) { + if (c->servers[i].role == RAFT_VOTER) { + n++; + } + } + return n; +} + +int configurationCopy(const struct raft_configuration *src, + struct raft_configuration *dst) +{ + size_t i; + int rv; + + configurationInit(dst); + for (i = 0; i < src->n; i++) { + struct raft_server *server = &src->servers[i]; + rv = configurationAdd(dst, server->id, server->address, + server->role); + if (rv != 0) { + goto err; + } + } + + return 0; + +err: + configurationClose(dst); + assert(rv == RAFT_NOMEM); + return rv; +} + +int configurationAdd(struct raft_configuration *c, + raft_id id, + const char *address, + int role) +{ + struct raft_server *servers; + struct raft_server *server; + char *address_copy; + size_t i; + int rv; + assert(c != NULL); + assert(id != 0); + + if (role != RAFT_STANDBY && role != RAFT_VOTER && role != RAFT_SPARE) { + rv = RAFT_BADROLE; + goto err; + } + + /* Check that neither the given id or address is already in use */ + for (i = 0; i < c->n; i++) { + server = &c->servers[i]; + if (server->id == id) { + rv = RAFT_DUPLICATEID; + goto err; + } + if (strcmp(server->address, address) == 0) { + rv = RAFT_DUPLICATEADDRESS; + goto err; + } + } + + /* Make a copy of the given address */ + address_copy = raft_malloc(strlen(address) + 1); + if (address_copy == NULL) { + rv = RAFT_NOMEM; + goto err; + } + strcpy(address_copy, address); + + /* Grow the servers array.. */ + servers = raft_realloc(c->servers, (c->n + 1) * sizeof *server); + if (servers == NULL) { + rv = RAFT_NOMEM; + goto err_after_address_copy; + } + c->servers = servers; + + /* Fill the newly allocated slot (the last one) with the given details. 
+ */ + server = &servers[c->n]; + server->id = id; + server->address = address_copy; + server->role = role; + + c->n++; + + return 0; + +err_after_address_copy: + raft_free(address_copy); +err: + assert(rv == RAFT_BADROLE || rv == RAFT_DUPLICATEID || + rv == RAFT_DUPLICATEADDRESS || rv == RAFT_NOMEM); + return rv; +} + +int configurationRemove(struct raft_configuration *c, const raft_id id) +{ + unsigned i; + unsigned j; + struct raft_server *servers; + int rv; + + assert(c != NULL); + + i = configurationIndexOf(c, id); + if (i == c->n) { + rv = RAFT_BADID; + goto err; + } + + assert(i < c->n); + + /* If this is the last server in the configuration, reset everything. */ + if (c->n - 1 == 0) { + assert(i == 0); + servers = NULL; + goto out; + } + + /* Create a new servers array. */ + servers = raft_calloc(c->n - 1, sizeof *servers); + if (servers == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + /* Copy the first part of the servers array into a new array, excluding + * the i'th server. */ + for (j = 0; j < i; j++) { + servers[j] = c->servers[j]; + } + + /* Copy the second part of the servers array into a new array. */ + for (j = i + 1; j < c->n; j++) { + servers[j - 1] = c->servers[j]; + } + +out: + /* Release the address of the server that was deleted. */ + raft_free(c->servers[i].address); + + /* Release the old servers array */ + raft_free(c->servers); + + c->servers = servers; + c->n--; + + return 0; + +err: + assert(rv == RAFT_BADID || rv == RAFT_NOMEM); + return rv; +} + +size_t configurationEncodedSize(const struct raft_configuration *c) +{ + size_t n = 0; + unsigned i; + + /* We need one byte for the encoding format version */ + n++; + + /* Then 8 bytes for number of servers. */ + n += sizeof(uint64_t); + + /* Then some space for each server. */ + for (i = 0; i < c->n; i++) { + struct raft_server *server = &c->servers[i]; + assert(server->address != NULL); + n += sizeof(uint64_t); /* Server ID */ + n += strlen(server->address) + 1; /* Address */ + n++; /* Voting flag */ + }; + + return bytePad64(n); +} + +void configurationEncodeToBuf(const struct raft_configuration *c, void *buf) +{ + void *cursor = buf; + unsigned i; + + /* Encoding format version */ + bytePut8(&cursor, ENCODING_FORMAT); + + /* Number of servers. */ + bytePut64(&cursor, c->n); + + for (i = 0; i < c->n; i++) { + struct raft_server *server = &c->servers[i]; + assert(server->address != NULL); + bytePut64(&cursor, server->id); + bytePutString(&cursor, server->address); + assert(server->role < 255); + bytePut8(&cursor, (uint8_t)server->role); + }; +} + +int configurationEncode(const struct raft_configuration *c, + struct raft_buffer *buf) +{ + int rv; + + assert(c != NULL); + assert(buf != NULL); + + /* The configuration can't be empty. */ + assert(c->n > 0); + + buf->len = configurationEncodedSize(c); + buf->base = raft_malloc(buf->len); + if (buf->base == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + configurationEncodeToBuf(c, buf->base); + + return 0; + +err: + assert(rv == RAFT_NOMEM); + return rv; +} + +int configurationDecode(const struct raft_buffer *buf, + struct raft_configuration *c) +{ + const void *cursor; + size_t i; + size_t n; + int rv; + + assert(c != NULL); + assert(buf != NULL); + + /* TODO: use 'if' instead of assert for checking buffer boundaries */ + assert(buf->len > 0); + + configurationInit(c); + + cursor = buf->base; + + /* Check the encoding format version */ + if (byteGet8(&cursor) != ENCODING_FORMAT) { + rv = RAFT_MALFORMED; + goto err; + } + + /* Read the number of servers. 
*/ + n = (size_t)byteGet64(&cursor); + + /* Decode the individual servers. */ + for (i = 0; i < n; i++) { + raft_id id; + const char *address; + int role; + + /* Server ID. */ + id = byteGet64(&cursor); + + /* Server Address. */ + address = byteGetString( + &cursor, buf->len - (size_t)((uint8_t *)cursor - + (uint8_t *)buf->base)); + if (address == NULL) { + rv = RAFT_MALFORMED; + goto err; + } + + /* Role code. */ + role = byteGet8(&cursor); + + rv = configurationAdd(c, id, address, role); + if (rv != 0) { + /* Only valid configurations should be ever be encoded, + * so in case configurationAdd() fails because of + * invalid data we return RAFT_MALFORMED. */ + if (rv != RAFT_NOMEM) { + rv = RAFT_MALFORMED; + } + goto err; + } + } + + return 0; + +err: + assert(rv == RAFT_MALFORMED || rv == RAFT_NOMEM); + configurationClose(c); + return rv; +} + +void configurationTrace(const struct raft *r, + struct raft_configuration *c, + const char *msg) +{ + (void)r; + tracef("%s", msg); + tracef("=== CONFIG START ==="); + unsigned i; + struct raft_server *s; + for (i = 0; i < c->n; i++) { + s = &c->servers[i]; + tracef("id:%llu address:%s role:%d", s->id, s->address, + s->role); + } + tracef("=== CONFIG END ==="); +} +#undef tracef diff --git a/src/raft/configuration.h b/src/raft/configuration.h new file mode 100644 index 000000000..dc1429c9b --- /dev/null +++ b/src/raft/configuration.h @@ -0,0 +1,131 @@ +/* Modify and inspect @raft_configuration objects. */ + +#ifndef CONFIGURATION_H_ +#define CONFIGURATION_H_ + +#include "../raft.h" + +/* Initialize an empty configuration. */ +void configurationInit(struct raft_configuration *c); + +/* Release all memory used by the given configuration. */ +void configurationClose(struct raft_configuration *c); + +/* Add a server to the given configuration. + * + * The given @address is copied and no reference to it is kept. In case of + * error, @c is left unchanged. + * + * Errors: + * + * RAFT_DUPLICATEID + * @c already has a server with the given id. + * + * RAFT_DUPLICATEADDRESS + * @c already has a server with the given @address. + * + * RAFT_BADROLE + * @role is not one of ROLE_STANDBY, ROLE_VOTER or ROLE_SPARE. + * + * RAFT_NOMEM + * A copy of @address could not me made or the @c->servers could not + * be extended + */ +int configurationAdd(struct raft_configuration *c, + raft_id id, + const char *address, + int role); + +/* Return the number of servers with the RAFT_VOTER role. */ +unsigned configurationVoterCount(const struct raft_configuration *c); + +/* Return the index of the server with the given ID (relative to the c->servers + * array). If there's no server with the given ID, return the number of + * servers. */ +unsigned configurationIndexOf(const struct raft_configuration *c, raft_id id); + +/* Return the index of the RAFT_VOTER server with the given ID (relative to the + * sub array of c->servers that has only voting servers). If there's no server + * with the given ID, or if it's not flagged as voting, return the number of + * servers. */ +unsigned configurationIndexOfVoter(const struct raft_configuration *c, + raft_id id); + +/* Get the server with the given ID, or #NULL if no matching server is found. */ +const struct raft_server *configurationGet(const struct raft_configuration *c, + raft_id id); + +/* Remove a server from a raft configuration. The given ID must match the one of + * an existing server in the configuration. + * + * In case of error @c is left unchanged. 
+ * + * Errors: + * + * RAFT_BADID + * @c does not contain any server with the given @id + * + * RAFT_NOMEM + * Memory to hold the new set of servers could not be allocated. + */ +int configurationRemove(struct raft_configuration *c, raft_id id); + +/* Deep copy @src to @dst. + * + * The configuration @src is assumed to be valid (i.e. each of its servers has a + * valid ID, address and role). + * + * The @dst configuration object must be uninitialized or empty. + * + * In case of error, both @src and @dst are left unchanged. + * + * Errors: + * + * RAFT_NOMEM + * Memory to copy all the servers could not be allocated. + */ +int configurationCopy(const struct raft_configuration *src, + struct raft_configuration *dst); + +/* Number of bytes needed to encode the given configuration object. */ +size_t configurationEncodedSize(const struct raft_configuration *c); + +/* Encode the given configuration object to the given pre-allocated buffer, + * which is assumed to be at least configurationEncodedSize(c) bytes. */ +void configurationEncodeToBuf(const struct raft_configuration *c, void *buf); + +/* Encode the given configuration object. The memory of the returned buffer is + * allocated using raft_malloc(), and client code is responsible for releasing + * it when no longer needed. + * + * Errors: + * + * RAFT_NOMEM + * Memory for the encoded buffer could not be allocated. + */ +int configurationEncode(const struct raft_configuration *c, + struct raft_buffer *buf); + +/* Populate a configuration object by decoding the given serialized payload. + * + * The @c configuration object must be uninitialized or empty. + * + * In case of error, @c will be left empty. + * + * Errors: + * + * RAFT_MALFORMED + * The given buffer does not contain a valid encoded configuration. + * + * RAFT_NOMEM + * Memory to populate the given configuration could not be allocated. + */ +int configurationDecode(const struct raft_buffer *buf, + struct raft_configuration *c); + +/* Output the configuration to the raft tracer */ +void configurationTrace(const struct raft *r, + struct raft_configuration *c, + const char *msg); + +#endif /* CONFIGURATION_H_ */ diff --git a/src/raft/convert.c b/src/raft/convert.c new file mode 100644 index 000000000..1c4d52d25 --- /dev/null +++ b/src/raft/convert.c @@ -0,0 +1,271 @@ +#include "convert.h" + +#include "../raft.h" +#include "../tracing.h" +#include "assert.h" +#include "callbacks.h" +#include "configuration.h" +#include "election.h" +#include "log.h" +#include "membership.h" +#include "progress.h" +#include "queue.h" +#include "replication.h" +#include "request.h" + +/* Convenience for setting a new state value and asserting that the transition + * is valid. */ +static void convertSetState(struct raft *r, unsigned short new_state) +{ + /* Check that the transition is legal, see Figure 3.3. Note that with + * respect to the paper we have an additional "unavailable" state, which + * is the initial or final state. 
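+ * The legal transitions are: unavailable -> follower, follower ->
+ * candidate, candidate -> follower or leader, leader -> follower, and
+ * follower, candidate or leader -> unavailable.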
*/ + unsigned short old_state = r->state; + tracef("old_state:%u new_state:%u", old_state, new_state); + assert((r->state == RAFT_UNAVAILABLE && new_state == RAFT_FOLLOWER) || + (r->state == RAFT_FOLLOWER && new_state == RAFT_CANDIDATE) || + (r->state == RAFT_CANDIDATE && new_state == RAFT_FOLLOWER) || + (r->state == RAFT_CANDIDATE && new_state == RAFT_LEADER) || + (r->state == RAFT_LEADER && new_state == RAFT_FOLLOWER) || + (r->state == RAFT_FOLLOWER && new_state == RAFT_UNAVAILABLE) || + (r->state == RAFT_CANDIDATE && new_state == RAFT_UNAVAILABLE) || + (r->state == RAFT_LEADER && new_state == RAFT_UNAVAILABLE)); + r->state = new_state; + if (r->state == RAFT_LEADER) { + r->leader_state.voter_contacts = 1; + } + + struct raft_callbacks *cbs = raftGetCallbacks(r); + if (cbs != NULL && cbs->state_cb != NULL) { + cbs->state_cb(r, old_state, new_state); + } +} + +/* Clear follower state. */ +static void convertClearFollower(struct raft *r) +{ + tracef("clear follower state"); + r->follower_state.current_leader.id = 0; + if (r->follower_state.current_leader.address != NULL) { + raft_free(r->follower_state.current_leader.address); + } + r->follower_state.current_leader.address = NULL; +} + +/* Clear candidate state. */ +static void convertClearCandidate(struct raft *r) +{ + tracef("clear candidate state"); + if (r->candidate_state.votes != NULL) { + raft_free(r->candidate_state.votes); + r->candidate_state.votes = NULL; + } +} + +static void convertFailApply(struct raft_apply *req) +{ + if (req != NULL && req->cb != NULL) { + req->cb(req, RAFT_LEADERSHIPLOST, NULL); + } +} + +static void convertFailBarrier(struct raft_barrier *req) +{ + if (req != NULL && req->cb != NULL) { + req->cb(req, RAFT_LEADERSHIPLOST); + } +} + +static void convertFailChange(struct raft_change *req) +{ + if (req != NULL && req->cb != NULL) { + req->cb(req, RAFT_LEADERSHIPLOST); + } +} + +/* Clear leader state. */ +static void convertClearLeader(struct raft *r) +{ + tracef("clear leader state"); + if (r->leader_state.progress != NULL) { + raft_free(r->leader_state.progress); + r->leader_state.progress = NULL; + } + + /* Fail all outstanding requests */ + while (!QUEUE_IS_EMPTY(&r->leader_state.requests)) { + struct request *req; + queue *head; + head = QUEUE_HEAD(&r->leader_state.requests); + QUEUE_REMOVE(head); + req = QUEUE_DATA(head, struct request, queue); + assert(req->type == RAFT_COMMAND || req->type == RAFT_BARRIER); + switch (req->type) { + case RAFT_COMMAND: + convertFailApply((struct raft_apply *)req); + break; + case RAFT_BARRIER: + convertFailBarrier((struct raft_barrier *)req); + break; + }; + } + + /* Fail any promote request that is still outstanding because the server + * is still catching up and no entry was submitted. */ + if (r->leader_state.change != NULL) { + convertFailChange(r->leader_state.change); + r->leader_state.change = NULL; + } +} + +/* Clear the current state */ +static void convertClear(struct raft *r) +{ + assert(r->state == RAFT_UNAVAILABLE || r->state == RAFT_FOLLOWER || + r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER); + switch (r->state) { + case RAFT_FOLLOWER: + convertClearFollower(r); + break; + case RAFT_CANDIDATE: + convertClearCandidate(r); + break; + case RAFT_LEADER: + convertClearLeader(r); + break; + } +} + +void convertToFollower(struct raft *r) +{ + convertClear(r); + convertSetState(r, RAFT_FOLLOWER); + + /* Reset election timer. 
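+ * A fresh randomized timeout is drawn, so servers stepping down at the
+ * same time are unlikely to start competing elections together.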
*/ + electionResetTimer(r); + + r->follower_state.current_leader.id = 0; + r->follower_state.current_leader.address = NULL; + r->follower_state.append_in_flight_count = 0; +} + +int convertToCandidate(struct raft *r, bool disrupt_leader) +{ + const struct raft_server *server; + size_t n_voters = configurationVoterCount(&r->configuration); + int rv; + + (void)server; /* Only used for assertions. */ + + convertClear(r); + convertSetState(r, RAFT_CANDIDATE); + + /* Allocate the votes array. */ + r->candidate_state.votes = raft_malloc(n_voters * sizeof(bool)); + if (r->candidate_state.votes == NULL) { + return RAFT_NOMEM; + } + r->candidate_state.disrupt_leader = disrupt_leader; + r->candidate_state.in_pre_vote = disrupt_leader ? false : r->pre_vote; + + /* Fast-forward to leader if we're the only voting server in the + * configuration. */ + server = configurationGet(&r->configuration, r->id); + assert(server != NULL); + assert(server->role == RAFT_VOTER); + + if (n_voters == 1) { + tracef("self elect and convert to leader"); + return convertToLeader(r); + } + + /* Start a new election round */ + rv = electionStart(r); + if (rv != 0) { + r->state = RAFT_FOLLOWER; + raft_free(r->candidate_state.votes); + return rv; + } + + return 0; +} + +void convertInitialBarrierCb(struct raft_barrier *req, int status) +{ + (void)status; + raft_free(req); +} + +int convertToLeader(struct raft *r) +{ + int rv; + + tracef("become leader for term %llu", r->current_term); + + convertClear(r); + convertSetState(r, RAFT_LEADER); + + /* Reset timers */ + r->election_timer_start = r->io->time(r->io); + + /* Reset apply requests queue */ + QUEUE_INIT(&r->leader_state.requests); + + /* Allocate and initialize the progress array. */ + rv = progressBuildArray(r); + if (rv != 0) { + return rv; + } + + r->leader_state.change = NULL; + + /* Reset promotion state. */ + r->leader_state.promotee_id = 0; + r->leader_state.round_number = 0; + r->leader_state.round_index = 0; + r->leader_state.round_start = 0; + + /* By definition, all entries until the last_stored entry will be + * committed if we are the only voter around. */ + size_t n_voters = configurationVoterCount(&r->configuration); + if (n_voters == 1 && (r->last_stored > r->commit_index)) { + tracef("apply log entries after self election %llu %llu", + r->last_stored, r->commit_index); + r->commit_index = r->last_stored; + rv = replicationApply(r); + } else if (n_voters > 1) { + /* Raft Dissertation, paragraph 6.4: + * The Leader Completeness Property guarantees that a leader has + * all committed entries, but at the start of its term, it may + * not know which those are. To find out, it needs to commit an + * entry from its term. Raft handles this by having each leader + * commit a blank no-op entry into the log at the start of its + * term. */ + struct raft_barrier *req = raft_malloc(sizeof(*req)); + if (req == NULL) { + return RAFT_NOMEM; + } + rv = raft_barrier(r, req, convertInitialBarrierCb); + if (rv != 0) { + tracef( + "failed to send no-op barrier entry after leader " + "conversion: " + "%d", + rv); + } + } + + return rv; +} + +void convertToUnavailable(struct raft *r) +{ + /* Abort any pending leadership transfer request. 
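+ * A transfer cannot be left pending once this server stops
+ * participating, so it is closed before the state switch.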
*/ + if (r->transfer != NULL) { + membershipLeadershipTransferClose(r); + } + convertClear(r); + convertSetState(r, RAFT_UNAVAILABLE); +} + +#undef tracef diff --git a/src/raft/convert.h b/src/raft/convert.h new file mode 100644 index 000000000..face1468e --- /dev/null +++ b/src/raft/convert.h @@ -0,0 +1,52 @@ +/* Convert from one state to another. */ + +#ifndef CONVERT_H_ +#define CONVERT_H_ + +#include "../raft.h" + +/* Convert from unavailable, or candidate or leader to follower. + * + * From Figure 3.1: + * + * If election timeout elapses without receiving AppendEntries RPC from + * current leader or granting vote to candidate: convert to candidate. + * + * The above implies that we need to reset the election timer when converting to + * follower. */ +void convertToFollower(struct raft *r); + +/* Convert from follower to candidate, starting a new election. + * + * From Figure 3.1: + * + * On conversion to candidate, start election + * + * If the disrupt_leader flag is true, the server will set the disrupt leader + * flag of the RequestVote messages it sends. */ +int convertToCandidate(struct raft *r, bool disrupt_leader); + +/* Convert from candidate to leader. + * + * From Figure 3.1: + * + * Upon election: send initial empty AppendEntries RPC (heartbeat) to each + * server. + * + * From Section 3.4: + * + * Once a candidate wins an election, it becomes leader. It then sends + * heartbeat messages to all of the other servers to establish its authority + * and prevent new elections. + * + * From Section 3.3: + * + * The leader maintains a nextIndex for each follower, which is the index + * of the next log entry the leader will send to that follower. When a + * leader first comes to power, it initializes all nextIndex values to the + * index just after the last one in its log. */ +int convertToLeader(struct raft *r); + +void convertToUnavailable(struct raft *r); + +#endif /* CONVERT_H_ */ diff --git a/src/raft/election.c b/src/raft/election.c new file mode 100644 index 000000000..ecdcd20f0 --- /dev/null +++ b/src/raft/election.c @@ -0,0 +1,327 @@ +#include "election.h" + +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "heap.h" +#include "log.h" + +/* Common fields between follower and candidate state. + * + * The follower_state and candidate_state structs in raft.h must be kept + * consistent with this definition. */ +struct followerOrCandidateState +{ + unsigned randomized_election_timeout; +}; + +/* Return a pointer to either the follower or candidate state. 
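+ * Both structs start with the randomized_election_timeout field (see the
+ * struct comment above), which is what makes the casts below safe.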
*/ +struct followerOrCandidateState *getFollowerOrCandidateState(struct raft *r) +{ + struct followerOrCandidateState *state; + assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); + if (r->state == RAFT_FOLLOWER) { + state = (struct followerOrCandidateState *)&r->follower_state; + } else { + state = (struct followerOrCandidateState *)&r->candidate_state; + } + return state; +} + +void electionResetTimer(struct raft *r) +{ + struct followerOrCandidateState *state = getFollowerOrCandidateState(r); + unsigned timeout = (unsigned)r->io->random( + r->io, (int)r->election_timeout, 2 * (int)r->election_timeout); + assert(timeout >= r->election_timeout); + assert(timeout <= r->election_timeout * 2); + state->randomized_election_timeout = timeout; + r->election_timer_start = r->io->time(r->io); +} + +bool electionTimerExpired(struct raft *r) +{ + struct followerOrCandidateState *state = getFollowerOrCandidateState(r); + raft_time now = r->io->time(r->io); + return now - r->election_timer_start >= + state->randomized_election_timeout; +} + +static void sendRequestVoteCb(struct raft_io_send *send, int status) +{ + (void)status; + RaftHeapFree(send); +} + +/* Send a RequestVote RPC to the given server. */ +static int electionSend(struct raft *r, const struct raft_server *server) +{ + struct raft_message message; + struct raft_io_send *send; + raft_term term; + int rv; + assert(server->id != r->id); + assert(server->id != 0); + + /* If we are in the pre-vote phase, we indicate our future term in the + * request. */ + term = r->current_term; + if (r->candidate_state.in_pre_vote) { + term++; + } + + /* Fill the RequestVote message. + * + * Note that we set last_log_index and last_log_term to the index and + * term of the last persisted entry, to the last entry in our in-memory + * log cache, because we must advertise only log entries that can't be + * lost at restart. + * + * Also note that, for a similar reason, we apply pending configuration + * changes only once they are persisted. When running an election we + * then use only persisted information, which is safe (while using + * unpersisted information for the log and persisted information for the + * configuration or viceversa would lead to inconsistencies and + * violations of Raft invariants). + */ + message.type = RAFT_IO_REQUEST_VOTE; + message.request_vote.term = term; + message.request_vote.candidate_id = r->id; + message.request_vote.last_log_index = r->last_stored; + message.request_vote.last_log_term = logTermOf(r->log, r->last_stored); + message.request_vote.disrupt_leader = r->candidate_state.disrupt_leader; + message.request_vote.pre_vote = r->candidate_state.in_pre_vote; + message.server_id = server->id; + message.server_address = server->address; + + send = RaftHeapMalloc(sizeof *send); + if (send == NULL) { + return RAFT_NOMEM; + } + + send->data = r; + + rv = r->io->send(r->io, send, &message, sendRequestVoteCb); + if (rv != 0) { + RaftHeapFree(send); + return rv; + } + + return 0; +} + +int electionStart(struct raft *r) +{ + raft_term term; + size_t n_voters; + size_t voting_index; + size_t i; + int rv; + assert(r->state == RAFT_CANDIDATE); + + n_voters = configurationVoterCount(&r->configuration); + voting_index = configurationIndexOfVoter(&r->configuration, r->id); + + /* This function should not be invoked if we are not a voting server, + * hence voting_index must be lower than the number of servers in the + * configuration (meaning that we are a voting server). 
*/ + assert(voting_index < r->configuration.n); + + /* Coherence check that configurationVoterCount and + * configurationIndexOfVoter have returned something that makes sense. + */ + assert(n_voters <= r->configuration.n); + assert(voting_index < n_voters); + + /* During pre-vote we don't increment our term, or reset our vote. + * Resetting our vote could lead to double-voting if we were to receive + * a RequestVote RPC during our Candidate state while we already voted + * for a server during the term. */ + if (!r->candidate_state.in_pre_vote) { + /* Increment current term */ + term = r->current_term + 1; + rv = r->io->set_term(r->io, term); + if (rv != 0) { + tracef("set_term failed %d", rv); + goto err; + } + tracef("beginning of term %llu", term); + + /* Vote for self */ + rv = r->io->set_vote(r->io, r->id); + if (rv != 0) { + tracef("set_vote self failed %d", rv); + goto err; + } + + /* Update our cache too. */ + r->current_term = term; + r->voted_for = r->id; + } + + /* Reset election timer. */ + electionResetTimer(r); + + assert(r->candidate_state.votes != NULL); + + /* Initialize the votes array and send vote requests. */ + for (i = 0; i < n_voters; i++) { + if (i == voting_index) { + r->candidate_state.votes[i] = + true; /* We vote for ourselves */ + } else { + r->candidate_state.votes[i] = false; + } + } + for (i = 0; i < r->configuration.n; i++) { + const struct raft_server *server = &r->configuration.servers[i]; + if (server->id == r->id || server->role != RAFT_VOTER) { + continue; + } + rv = electionSend(r, server); + if (rv != 0) { + /* This is not a critical failure, let's just log it. */ + tracef("failed to send vote request to server %llu: %s", + server->id, raft_strerror(rv)); + } + } + + return 0; + +err: + assert(rv != 0); + return rv; +} + +int electionVote(struct raft *r, + const struct raft_request_vote *args, + bool *granted) +{ + const struct raft_server *local_server; + raft_index local_last_index; + raft_term local_last_term; + bool is_transferee; /* Requester is the target of a leadership transfer + */ + int rv; + + assert(r != NULL); + assert(args != NULL); + assert(granted != NULL); + + local_server = configurationGet(&r->configuration, r->id); + + *granted = false; + + if (local_server == NULL || local_server->role != RAFT_VOTER) { + tracef("local server is not voting -> not granting vote"); + return 0; + } + + is_transferee = + r->transfer != NULL && r->transfer->id == args->candidate_id; + if (!args->pre_vote && r->voted_for != 0 && + r->voted_for != args->candidate_id && !is_transferee) { + tracef("local server already voted -> not granting vote"); + return 0; + } + + /* Raft Dissertation 9.6: + * > In the Pre-Vote algorithm, a candidate + * > only increments its term if it first learns from a majority of the + * > cluster that they would be willing + * > to grant the candidate their votes (if the candidate's log is + * > sufficiently up-to-date, and the voters + * > have not received heartbeats from a valid leader for at least a + * baseline > election timeout) Arriving here means that in a pre-vote + * phase, we will cast our vote if the candidate's log is sufficiently + * up-to-date, no matter what the candidate's term is. We have already + * checked if we currently have a leader upon reception of the + * RequestVote RPC, meaning the 2 conditions will be satisfied if the + * candidate's log is up-to-date. + * */ + local_last_index = logLastIndex(r->log); + + /* Our log is definitely not more up-to-date if it's empty! 
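+ * Raft's up-to-date check follows: first compare the term of the last
+ * entry, and on equal terms compare log length.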
*/ + if (local_last_index == 0) { + tracef("local log is empty -> granting vote"); + goto grant_vote; + } + + local_last_term = logLastTerm(r->log); + + if (args->last_log_term < local_last_term) { + /* The requesting server has last entry's log term lower than + * ours. */ + tracef( + "local last entry %llu has term %llu higher than %llu -> " + "not " + "granting", + local_last_index, local_last_term, args->last_log_term); + return 0; + } + + if (args->last_log_term > local_last_term) { + /* The requesting server has a more up-to-date log. */ + tracef( + "remote last entry %llu has term %llu higher than %llu -> " + "granting vote", + args->last_log_index, args->last_log_term, local_last_term); + goto grant_vote; + } + + /* The term of the last log entry is the same, so let's compare the + * length of the log. */ + assert(args->last_log_term == local_last_term); + + if (local_last_index <= args->last_log_index) { + /* Our log is shorter or equal to the one of the requester. */ + tracef( + "remote log equal or longer than local -> granting vote"); + goto grant_vote; + } + + tracef("remote log shorter than local -> not granting vote"); + + return 0; + +grant_vote: + if (!args->pre_vote) { + rv = r->io->set_vote(r->io, args->candidate_id); + if (rv != 0) { + tracef("set_vote failed %d", rv); + return rv; + } + r->voted_for = args->candidate_id; + + /* Reset the election timer. */ + r->election_timer_start = r->io->time(r->io); + } + + tracef("vote granted to %llu", args->candidate_id); + *granted = true; + + return 0; +} + +bool electionTally(struct raft *r, size_t voter_index) +{ + size_t n_voters = configurationVoterCount(&r->configuration); + size_t votes = 0; + size_t i; + size_t half = n_voters / 2; + + assert(r->state == RAFT_CANDIDATE); + assert(r->candidate_state.votes != NULL); + + r->candidate_state.votes[voter_index] = true; + + for (i = 0; i < n_voters; i++) { + if (r->candidate_state.votes[i]) { + votes++; + } + } + + return votes >= half + 1; +} + +#undef tracef diff --git a/src/raft/election.h b/src/raft/election.h new file mode 100644 index 000000000..0ead5503a --- /dev/null +++ b/src/raft/election.h @@ -0,0 +1,81 @@ +/* Election-related logic and helpers. */ + +#ifndef ELECTION_H_ +#define ELECTION_H_ + +#include "../raft.h" + +/* Reset the election_timer clock and set randomized_election_timeout to a + * random value between election_timeout and 2 * election_timeout. + * + * From Section 3.4: + * + * Raft uses randomized election timeouts to ensure that split votes are rare + * and that they are resolved quickly. To prevent split votes in the first + * place, election timeouts are chosen randomly from a fixed interval (e.g., + * 150-300 ms). This spreads out the servers so that in most cases only a + * single server will time out. + * + * From Section 9.4: + * + * We used AvailSim to approximate a WAN spanning the continental US. Each + * message was assigned a latency chosen randomly from the uniform range of + * 30-40 ms, and the servers' election timeout range was set accordingly to + * 300-600 ms (about 10-20 times the one-way network latency). When only one + * of the five servers has failed, the average election completes within about + * 475 ms, and 99.9% of elections complete within 1.5 s. Even when two of the + * five servers have failed, the average election takes about 650 ms (about 20 + * times the one-way network latency), and 99.9% of elections complete in 3 + * s. We believe these election times are more than adequate for most WAN + * deployments. 
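+ *
+ * For example, with election_timeout = 150 ms, electionResetTimer() draws
+ * the randomized timeout uniformly from the [150 ms, 300 ms] interval.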
diff --git a/src/raft/election.h b/src/raft/election.h
new file mode 100644
index 000000000..0ead5503a
--- /dev/null
+++ b/src/raft/election.h
@@ -0,0 +1,81 @@
+/* Election-related logic and helpers. */
+
+#ifndef ELECTION_H_
+#define ELECTION_H_
+
+#include "../raft.h"
+
+/* Reset the election_timer clock and set randomized_election_timeout to a
+ * random value between election_timeout and 2 * election_timeout.
+ *
+ * From Section 3.4:
+ *
+ *   Raft uses randomized election timeouts to ensure that split votes are
+ *   rare and that they are resolved quickly. To prevent split votes in the
+ *   first place, election timeouts are chosen randomly from a fixed interval
+ *   (e.g., 150-300 ms). This spreads out the servers so that in most cases
+ *   only a single server will time out.
+ *
+ * From Section 9.4:
+ *
+ *   We used AvailSim to approximate a WAN spanning the continental US. Each
+ *   message was assigned a latency chosen randomly from the uniform range of
+ *   30-40 ms, and the servers' election timeout range was set accordingly to
+ *   300-600 ms (about 10-20 times the one-way network latency). When only one
+ *   of the five servers has failed, the average election completes within
+ *   about 475 ms, and 99.9% of elections complete within 1.5 s. Even when two
+ *   of the five servers have failed, the average election takes about 650 ms
+ *   (about 20 times the one-way network latency), and 99.9% of elections
+ *   complete in 3 s. We believe these election times are more than adequate
+ *   for most WAN deployments.
+ *
+ * Must be called in follower or candidate state. */
+void electionResetTimer(struct raft *r);
+
+/* Return true if the election timer has expired.
+ *
+ * Must be called in follower or candidate state. */
+bool electionTimerExpired(struct raft *r);
+
+/* Start a new election round.
+ *
+ * From Figure 3.1:
+ *
+ *   [Rules for Servers] Candidates: On conversion to candidates, start
+ *   election:
+ *
+ *   - Increment current term
+ *   - Vote for self
+ *   - Reset election timer
+ *   - Send RequestVote RPCs to all other servers
+ *
+ * From Section 3.4:
+ *
+ *   To begin an election, a follower increments its current term and
+ *   transitions to candidate state. It then votes for itself and issues
+ *   RequestVote RPCs in parallel to each of the other servers in the
+ *   cluster. */
+int electionStart(struct raft *r);
+
+/* Decide whether our vote should be granted to the requesting server and
+ * update our state accordingly.
+ *
+ * From Figure 3.1:
+ *
+ *   RequestVote RPC: Receiver Implementation:
+ *
+ *   - If votedFor is null or candidateId, and candidate's log is at least as
+ *     up-to-date as receiver's log, grant vote.
+ *
+ * The outcome of the decision is stored through the @granted pointer. */
+int electionVote(struct raft *r,
+		 const struct raft_request_vote *args,
+		 bool *granted);
+
+/* Update the votes array by adding the vote from the server at the given
+ * index. Return true if with this vote the server has reached a majority of
+ * votes and has won the election. */
+bool electionTally(struct raft *r, size_t voter_index);
+
+#endif /* ELECTION_H_ */
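The contract documented for electionResetTimer above (a value drawn uniformly
from [election_timeout, 2 * election_timeout]) can be exercised in isolation.
A minimal sketch, assuming a plain rand()-based source in place of the
raft_io random method:

    #include <stdlib.h>

    /* Pick a timeout uniformly in [base, 2 * base). */
    static unsigned randomized_timeout(unsigned base)
    {
            return base + (unsigned)rand() % base;
    }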
diff --git a/src/raft/entry.c b/src/raft/entry.c
new file mode 100644
index 000000000..15ac56725
--- /dev/null
+++ b/src/raft/entry.c
@@ -0,0 +1,84 @@
+#include <stdint.h>
+#include <string.h>
+
+#include "assert.h"
+#include "entry.h"
+
+void entryBatchesDestroy(struct raft_entry *entries, const size_t n)
+{
+	void *batch = NULL;
+	size_t i;
+	if (entries == NULL) {
+		assert(n == 0);
+		return;
+	}
+	assert(n > 0);
+	for (i = 0; i < n; i++) {
+		assert(entries[i].batch != NULL);
+		if (entries[i].batch != batch) {
+			batch = entries[i].batch;
+			raft_free(batch);
+		}
+	}
+	raft_free(entries);
+}
+
+int entryCopy(const struct raft_entry *src, struct raft_entry *dst)
+{
+	dst->term = src->term;
+	dst->type = src->type;
+	dst->buf.len = src->buf.len;
+	dst->buf.base = raft_malloc(dst->buf.len);
+	if (dst->buf.len > 0 && dst->buf.base == NULL) {
+		return RAFT_NOMEM;
+	}
+	memcpy(dst->buf.base, src->buf.base, dst->buf.len);
+	dst->batch = NULL;
+	return 0;
+}
+
+int entryBatchCopy(const struct raft_entry *src,
+		   struct raft_entry **dst,
+		   const size_t n)
+{
+	size_t size = 0;
+	void *batch;
+	uint8_t *cursor;
+	unsigned i;
+
+	if (n == 0) {
+		*dst = NULL;
+		return 0;
+	}
+
+	/* Calculate the total size of the entries content and allocate the
+	 * batch. */
+	for (i = 0; i < n; i++) {
+		size += src[i].buf.len;
+	}
+
+	batch = raft_malloc(size);
+	if (batch == NULL) {
+		return RAFT_NOMEM;
+	}
+
+	/* Copy the entries. */
+	*dst = raft_malloc(n * sizeof **dst);
+	if (*dst == NULL) {
+		raft_free(batch);
+		return RAFT_NOMEM;
+	}
+
+	cursor = batch;
+
+	for (i = 0; i < n; i++) {
+		(*dst)[i].term = src[i].term;
+		(*dst)[i].type = src[i].type;
+		(*dst)[i].buf.base = cursor;
+		(*dst)[i].buf.len = src[i].buf.len;
+		(*dst)[i].batch = batch;
+		memcpy((*dst)[i].buf.base, src[i].buf.base, src[i].buf.len);
+		cursor += src[i].buf.len;
+	}
+	return 0;
+}
diff --git a/src/raft/entry.h b/src/raft/entry.h
new file mode 100644
index 000000000..b571ebb8c
--- /dev/null
+++ b/src/raft/entry.h
@@ -0,0 +1,19 @@
+#ifndef ENTRY_H_
+#define ENTRY_H_
+
+#include "../raft.h"
+
+/* Release all memory associated with the given entries, including the array
+ * itself. The entries are supposed to belong to one or more batches. */
+void entryBatchesDestroy(struct raft_entry *entries, size_t n);
+
+/* Create a copy of a log entry, including its data. */
+int entryCopy(const struct raft_entry *src, struct raft_entry *dst);
+
+/* Create a single batch of entries containing a copy of the given entries,
+ * including their data. */
+int entryBatchCopy(const struct raft_entry *src,
+		   struct raft_entry **dst,
+		   size_t n);
+
+#endif /* ENTRY_H_ */
diff --git a/src/raft/err.c b/src/raft/err.c
new file mode 100644
index 000000000..cc6c5cdad
--- /dev/null
+++ b/src/raft/err.c
@@ -0,0 +1,72 @@
+#include "err.h"
+
+#include <string.h>
+
+#include "../raft.h"
+#include "assert.h"
+
+#define WRAP_SEP ": "
+#define WRAP_SEP_LEN ((size_t)strlen(WRAP_SEP))
+
+void errMsgWrap(char *e, const char *format)
+{
+	size_t n = RAFT_ERRMSG_BUF_SIZE;
+	size_t prefix_n;
+	size_t prefix_and_sep_n;
+	size_t trail_n;
+	size_t i;
+
+	/* Calculate the length of the prefix. */
+	prefix_n = strlen(format);
+
+	/* If there isn't enough space for the ": " separator and at least one
+	 * character of the wrapped error message, then just print the prefix.
+	 */
+	if (prefix_n >= n - (WRAP_SEP_LEN + 1)) {
+/* We explicitly allow truncation here + silence clang about unknown
+ * warning-group "-Wformat-truncation" */
+#ifdef __GNUC__
+#ifndef __clang__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-truncation"
+#endif
+#endif
+		ErrMsgPrintf(e, "%s", format);
+#ifdef __GNUC__
+#ifndef __clang__
+#pragma GCC diagnostic pop
+#endif
+#endif
+		return;
+	}
+
+	/* Right-shift the wrapped message, to make room for the prefix. */
+	prefix_and_sep_n = prefix_n + WRAP_SEP_LEN;
+	trail_n = strnlen(e, n - prefix_and_sep_n - 1);
+	memmove(e + prefix_and_sep_n, e, trail_n);
+	e[prefix_and_sep_n + trail_n] = 0;
+
+	/* Print the prefix. */
+	ErrMsgPrintf(e, "%s", format);
+
+	/* Print the separator.
+	 *
+	 * Avoid using strncpy(e->msg + prefix_n, WRAP_SEP, WRAP_SEP_LEN) since
+	 * it generates a warning. */
+	for (i = 0; i < WRAP_SEP_LEN; i++) {
+		e[prefix_n + i] = WRAP_SEP[i];
+	}
+}
+
+#define ERR_CODE_TO_STRING_CASE(CODE, MSG) \
+	case CODE:                         \
+		return MSG;
+
+const char *errCodeToString(int code)
+{
+	switch (code) {
+		ERR_CODE_TO_STRING_MAP(ERR_CODE_TO_STRING_CASE);
+		default:
+			return "unknown error";
+	}
+}
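errMsgWrap shifts the existing message right and prepends the new prefix plus
the ": " separator. Combined with the ErrMsgWrapf macro declared in err.h
below, a call chain composes messages outermost-first; a sketch, assuming
RAFT_ERRMSG_BUF_SIZE and the macros as defined in this patch:

    char errmsg[RAFT_ERRMSG_BUF_SIZE];
    ErrMsgPrintf(errmsg, "open: %s", "no such file");
    ErrMsgWrapf(errmsg, "load segment %d", 1);
    /* errmsg is now "load segment 1: open: no such file" */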
diff --git a/src/raft/err.h b/src/raft/err.h
new file mode 100644
index 000000000..fb157ce90
--- /dev/null
+++ b/src/raft/err.h
@@ -0,0 +1,67 @@
+/* Utilities around error handling. */
+
+#ifndef ERROR_H_
+#define ERROR_H_
+
+#include <stdio.h>
+#include <string.h>
+
+#define ERR_CODE_TO_STRING_MAP(X)                                         \
+	X(RAFT_NOMEM, "out of memory")                                    \
+	X(RAFT_BADID, "server ID is not valid")                           \
+	X(RAFT_DUPLICATEID, "server ID already in use")                   \
+	X(RAFT_DUPLICATEADDRESS, "server address already in use")         \
+	X(RAFT_BADROLE, "server role is not valid")                       \
+	X(RAFT_MALFORMED, "encoded data is malformed")                    \
+	X(RAFT_NOTLEADER, "server is not the leader")                     \
+	X(RAFT_LEADERSHIPLOST, "server has lost leadership")              \
+	X(RAFT_SHUTDOWN, "server is shutting down")                       \
+	X(RAFT_CANTBOOTSTRAP, "bootstrap only works on new clusters")     \
+	X(RAFT_CANTCHANGE, "a configuration change is already in progress") \
+	X(RAFT_CORRUPT, "persisted data is corrupted")                    \
+	X(RAFT_CANCELED, "operation canceled")                            \
+	X(RAFT_NAMETOOLONG, "resource name too long")                     \
+	X(RAFT_TOOBIG, "data is too big")                                 \
+	X(RAFT_NOCONNECTION, "no connection to remote server available")  \
+	X(RAFT_BUSY, "operation can't be performed at this time")         \
+	X(RAFT_IOERR, "I/O error")                                        \
+	X(RAFT_NOTFOUND, "Resource not found")                            \
+	X(RAFT_INVALID, "Invalid parameter")                              \
+	X(RAFT_UNAUTHORIZED, "No access to resource")                     \
+	X(RAFT_NOSPACE, "Not enough disk space")                          \
+	X(RAFT_TOOMANY, "System or raft limit met or exceeded")
+
+/* Format an error message. */
+#define ErrMsgPrintf(ERRMSG, ...) \
+	snprintf(ERRMSG, RAFT_ERRMSG_BUF_SIZE, __VA_ARGS__)
+
+/* Wrap the given error message with an additional prefix message. */
+#define ErrMsgWrapf(ERRMSG, ...)                            \
+	do {                                                \
+		char _errmsg[RAFT_ERRMSG_BUF_SIZE];         \
+		ErrMsgPrintf(_errmsg, __VA_ARGS__);         \
+		errMsgWrap(ERRMSG, _errmsg);                \
+	} while (0)
+
+void errMsgWrap(char *e, const char *format);
+
+/* Transfer an error message from one object to another, wrapping it. */
+#define ErrMsgTransfer(ERRMSG1, ERRMSG2, FORMAT)         \
+	memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE);  \
+	ErrMsgWrapf(ERRMSG2, FORMAT)
+
+#define ErrMsgTransferf(ERRMSG1, ERRMSG2, FORMAT, ...)   \
+	memcpy(ERRMSG2, ERRMSG1, RAFT_ERRMSG_BUF_SIZE);  \
+	ErrMsgWrapf(ERRMSG2, FORMAT, __VA_ARGS__)
+
+/* Use the static error message for the error with the given code. */
+#define ErrMsgFromCode(ERRMSG, CODE) \
+	ErrMsgPrintf(ERRMSG, "%s", errCodeToString(CODE))
+
+/* Format the out of memory error message. */
+#define ErrMsgOom(ERRMSG) ErrMsgFromCode(ERRMSG, RAFT_NOMEM)
+
+/* Convert a numeric raft error code to a human-readable error message. */
+const char *errCodeToString(int code);
+
+#endif /* ERROR_H_ */
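ERR_CODE_TO_STRING_MAP above is an X-macro: one list of (code, message) pairs
expands differently at each use site, so codes and strings can never drift
apart. A minimal standalone sketch of the technique, with hypothetical names:

    #define COLOR_MAP(X) \
            X(RED, "red") \
            X(GREEN, "green")

    enum color {
    #define ENUM_ITEM(CODE, MSG) CODE,
            COLOR_MAP(ENUM_ITEM)
    #undef ENUM_ITEM
    };

    static const char *colorName(enum color c)
    {
            switch (c) {
    #define STRING_CASE(CODE, MSG) \
            case CODE:             \
                    return MSG;
                    COLOR_MAP(STRING_CASE)
    #undef STRING_CASE
            }
            return "unknown";
    }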
diff --git a/src/raft/fixture.c b/src/raft/fixture.c
new file mode 100644
index 000000000..cef8cfc41
--- /dev/null
+++ b/src/raft/fixture.c
@@ -0,0 +1,1995 @@
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../raft.h"
+#include "../tracing.h"
+#include "assert.h"
+#include "configuration.h"
+#include "convert.h"
+#include "entry.h"
+#include "log.h"
+#include "queue.h"
+#include "snapshot.h"
+
+/* Defaults */
+#define HEARTBEAT_TIMEOUT 100
+#define INSTALL_SNAPSHOT_TIMEOUT 30000
+#define ELECTION_TIMEOUT 1000
+#define NETWORK_LATENCY 15
+#define DISK_LATENCY 10
+#define WORK_DURATION 200
+#define SEND_LATENCY 0
+
+/* To keep in sync with raft.h */
+#define N_MESSAGE_TYPES 6
+
+/* Maximum number of peer stub instances connected to a certain stub
+ * instance. This should be enough for testing purposes. */
+#define MAX_PEERS 8
+
+struct raft_fixture_server
+{
+	bool alive;                /* If false, the server is down. */
+	raft_id id;                /* Server ID. */
+	char address[16];          /* Server address (stringified ID). */
+	struct raft_tracer tracer; /* Tracer. */
+	struct raft_io io;         /* In-memory raft_io implementation. */
+	struct raft raft;          /* Raft instance. */
+};
+
+struct raft_fixture_event
+{
+	unsigned server_index; /* Index of the server the event occurred on. */
+	int type;              /* Type of the event. */
+};
+
+RAFT_API int raft_fixture_event_type(struct raft_fixture_event *event)
+{
+	assert(event != NULL);
+	return event->type;
+}
+
+RAFT_API unsigned raft_fixture_event_server_index(
+    struct raft_fixture_event *event)
+{
+	assert(event != NULL);
+	return event->server_index;
+}
+
+/* Fields common across all request types. */
+#define REQUEST                                                               \
+	int type;                  /* Request code type. */                   \
+	raft_time completion_time; /* When the request should be fulfilled. */ \
+	queue queue                /* Link the I/O pending requests queue. */
+
+/* Request type codes. */
+enum { APPEND = 1, SEND, TRANSMIT, SNAPSHOT_PUT, SNAPSHOT_GET, ASYNC_WORK };
+
+/* Abstract base type for an asynchronous request submitted to the stub I/O
+ * implementation. */
+struct ioRequest
+{
+	REQUEST;
+};
+
+/* Pending request to append entries to the log. */
+struct append
+{
+	REQUEST;
+	struct raft_io_append *req;
+	const struct raft_entry *entries;
+	unsigned n;
+	unsigned start; /* Request timestamp. */
+};
+
+/* Pending request to send a message. */
+struct send
+{
+	REQUEST;
+	struct raft_io_send *req;
+	struct raft_message message;
+};
+
+/* Pending request to store a snapshot. */
+struct snapshot_put
+{
+	REQUEST;
+	unsigned trailing;
+	struct raft_io_snapshot_put *req;
+	const struct raft_snapshot *snapshot;
+};
+
+/* Pending request to perform general work. */
+struct async_work
+{
+	REQUEST;
+	struct raft_io_async_work *req;
+};
+
+/* Pending request to load a snapshot. */
+struct snapshot_get
+{
+	REQUEST;
+	struct raft_io_snapshot_get *req;
+};
+
+/* Message that has been written to the network and is waiting to be delivered
+ * (or discarded). */
+struct transmit
+{
+	REQUEST;
+	struct raft_message message; /* Message to deliver */
+	int timer;                   /* Deliver after this n of msecs. */
+};
+
+/* Information about a peer server. */
+struct peer
+{
+	struct io *io;  /* The peer's I/O backend. */
+	bool connected; /* Whether a connection is established. */
+	bool saturated; /* Whether the established connection is saturated. */
+	unsigned send_latency;
+};
+
+/* Stub I/O implementation implementing all operations in-memory. */
+struct io
+{
+	struct raft_io *io;  /* I/O object we're implementing. */
+	unsigned index;      /* Fixture server index. */
+	raft_time *time;     /* Global cluster time. */
+	raft_time next_tick; /* Time the next tick should occur. */
+
+	/* Term and vote */
+	raft_term term;
+	raft_id voted_for;
+
+	/* Log */
+	struct raft_snapshot *snapshot; /* Latest snapshot */
+	struct raft_entry *entries;     /* Array of persisted entries */
+	size_t n;                       /* Size of the persisted entries array */
+
+	/* Parameters passed via raft_io->init and raft_io->start */
+	raft_id id;
+	const char *address;
+	unsigned tick_interval;
+	raft_io_tick_cb tick_cb;
+	raft_io_recv_cb recv_cb;
+
+	/* Queue of pending asynchronous requests, whose callbacks still haven't
+	 * been fired. */
+	queue requests;
+
+	/* Peers connected to us. */
+	struct peer peers[MAX_PEERS];
+	unsigned n_peers;
+
+	unsigned randomized_election_timeout; /* Value returned by io->random() */
+	unsigned network_latency;             /* Milliseconds to deliver RPCs */
+	unsigned disk_latency;                /* Milliseconds to perform disk I/O */
+	unsigned work_duration;               /* Milliseconds to run async work */
+
+	int append_fault_countdown;
+	int vote_fault_countdown;
+	int term_fault_countdown;
+	int send_fault_countdown;
+
+	/* If flag i is true, messages of type i will be silently dropped. */
+	bool drop[N_MESSAGE_TYPES];
+
+	/* Counters of events that happened so far. */
+	unsigned n_send[N_MESSAGE_TYPES];
+	unsigned n_recv[N_MESSAGE_TYPES];
+	unsigned n_append;
+};
+
+static bool faultTick(int *countdown)
+{
+	bool trigger = *countdown == 0;
+	if (*countdown >= 0) {
+		*countdown -= 1;
+	}
+	return trigger;
+}
+
+static int ioMethodInit(struct raft_io *raft_io,
+			raft_id id,
+			const char *address)
+{
+	struct io *io = raft_io->impl;
+	io->id = id;
+	io->address = address;
+	return 0;
+}
+
+static int ioMethodStart(struct raft_io *raft_io,
+			 unsigned msecs,
+			 raft_io_tick_cb tick_cb,
+			 raft_io_recv_cb recv_cb)
+{
+	struct io *io = raft_io->impl;
+	io->tick_interval = msecs;
+	io->tick_cb = tick_cb;
+	io->recv_cb = recv_cb;
+	io->next_tick = *io->time + io->tick_interval;
+	return 0;
+}
+
+/* Flush an append entries request, appending its entries to the local
+ * in-memory log. */
+static void ioFlushAppend(struct io *s, struct append *append)
+{
+	struct raft_entry *entries;
+	unsigned i;
+	int status = 0;
+
+	/* Simulates a disk write failure. */
+	if (faultTick(&s->append_fault_countdown)) {
+		status = RAFT_IOERR;
+		goto done;
+	}
+
+	/* Allocate an array for the old entries plus the new ones. */
+	entries =
+	    raft_realloc(s->entries, (s->n + append->n) * sizeof *s->entries);
+	assert(entries != NULL);
+
+	/* Copy new entries into the new array. */
+	for (i = 0; i < append->n; i++) {
+		const struct raft_entry *src = &append->entries[i];
+		struct raft_entry *dst = &entries[s->n + i];
+		int rv = entryCopy(src, dst);
+		assert(rv == 0);
+	}
+
+	s->entries = entries;
+	s->n += append->n;
+
+done:
+	if (append->req->cb != NULL) {
+		append->req->cb(append->req, status);
+	}
+	raft_free(append);
+}
+
+/* Flush a snapshot put request, copying the snapshot data. */
+static void ioFlushSnapshotPut(struct io *s, struct snapshot_put *r)
+{
+	int rv;
+
+	if (s->snapshot == NULL) {
+		s->snapshot = raft_malloc(sizeof *s->snapshot);
+		assert(s->snapshot != NULL);
+	} else {
+		snapshotClose(s->snapshot);
+	}
+
+	rv = snapshotCopy(r->snapshot, s->snapshot);
+	assert(rv == 0);
+
+	if (r->trailing == 0) {
+		rv = s->io->truncate(s->io, 1);
+		assert(rv == 0);
+	}
+
+	if (r->req->cb != NULL) {
+		r->req->cb(r->req, 0);
+	}
+	raft_free(r);
+}
+
+/* Flush a snapshot get request, returning to the client a copy of the local
+ * snapshot (if any). */
+static void ioFlushSnapshotGet(struct io *s, struct snapshot_get *r)
+{
+	struct raft_snapshot *snapshot;
+	int rv;
+	snapshot = raft_malloc(sizeof *snapshot);
+	assert(snapshot != NULL);
+	rv = snapshotCopy(s->snapshot, snapshot);
+	assert(rv == 0);
+	r->req->cb(r->req, snapshot, 0);
+	raft_free(r);
+}
+
+/* Flush an async work request */
+static void ioFlushAsyncWork(struct io *s, struct async_work *r)
+{
+	(void)s;
+	int rv;
+	rv = r->req->work(r->req);
+	r->req->cb(r->req, rv);
+	raft_free(r);
+}
+
+/* Search for the peer with the given ID. */
+static struct peer *ioGetPeer(struct io *io, raft_id id)
+{
+	unsigned i;
+	for (i = 0; i < io->n_peers; i++) {
+		struct peer *peer = &io->peers[i];
+		if (peer->io->id == id) {
+			return peer;
+		}
+	}
+	return NULL;
+}
+
+/* Copy the dynamically allocated memory of an AppendEntries message. */
+static void copyAppendEntries(const struct raft_append_entries *src,
+			      struct raft_append_entries *dst)
+{
+	int rv;
+	rv = entryBatchCopy(src->entries, &dst->entries, src->n_entries);
+	assert(rv == 0);
+	dst->n_entries = src->n_entries;
+}
+
+/* Copy the dynamically allocated memory of an InstallSnapshot message. */
+static void copyInstallSnapshot(const struct raft_install_snapshot *src,
+				struct raft_install_snapshot *dst)
+{
+	int rv;
+	rv = configurationCopy(&src->conf, &dst->conf);
+	assert(rv == 0);
+	dst->data.base = raft_malloc(dst->data.len);
+	assert(dst->data.base != NULL);
+	memcpy(dst->data.base, src->data.base, src->data.len);
+}
+
+/* Flush a raft_io_send request, copying the message content into a new struct
+ * transmit object and invoking the user callback. */
+static void ioFlushSend(struct io *io, struct send *send)
+{
+	struct peer *peer;
+	struct transmit *transmit;
+	struct raft_message *src;
+	struct raft_message *dst;
+	int status;
+
+	/* If the peer doesn't exist or was disconnected, fail the request. */
+	peer = ioGetPeer(io, send->message.server_id);
+	if (peer == NULL || !peer->connected) {
+		status = RAFT_NOCONNECTION;
+		goto out;
+	}
+
+	transmit = raft_calloc(1, sizeof *transmit);
+	assert(transmit != NULL);
+
+	transmit->type = TRANSMIT;
+	transmit->completion_time = *io->time + io->network_latency;
+
+	src = &send->message;
+	dst = &transmit->message;
+
+	QUEUE_PUSH(&io->requests, &transmit->queue);
+
+	*dst = *src;
+	switch (dst->type) {
+		case RAFT_IO_APPEND_ENTRIES:
+			/* Make a copy of the entries being sent */
+			copyAppendEntries(&src->append_entries,
+					  &dst->append_entries);
+			break;
+		case RAFT_IO_INSTALL_SNAPSHOT:
+			copyInstallSnapshot(&src->install_snapshot,
+					    &dst->install_snapshot);
+			break;
+	}
+
+	io->n_send[send->message.type]++;
+	status = 0;
+
+out:
+	if (send->req->cb != NULL) {
+		send->req->cb(send->req, status);
+	}
+
+	raft_free(send);
+}
+
+/* Release the memory used by the given message transmit object. */
+static void ioDestroyTransmit(struct transmit *transmit)
+{
+	struct raft_message *message;
+	message = &transmit->message;
+	switch (message->type) {
+		case RAFT_IO_APPEND_ENTRIES:
+			if (message->append_entries.entries != NULL) {
+				raft_free(
+				    message->append_entries.entries[0].batch);
+				raft_free(message->append_entries.entries);
+			}
+			break;
+		case RAFT_IO_INSTALL_SNAPSHOT:
+			raft_configuration_close(
+			    &message->install_snapshot.conf);
+			raft_free(message->install_snapshot.data.base);
+			break;
+	}
+	raft_free(transmit);
+}
+
+/* Flush all requests in the queue. */
+static void ioFlushAll(struct io *io)
+{
+	while (!QUEUE_IS_EMPTY(&io->requests)) {
+		queue *head;
+		struct ioRequest *r;
+
+		head = QUEUE_HEAD(&io->requests);
+		QUEUE_REMOVE(head);
+
+		r = QUEUE_DATA(head, struct ioRequest, queue);
+		switch (r->type) {
+			case APPEND:
+				ioFlushAppend(io, (struct append *)r);
+				break;
+			case SEND:
+				ioFlushSend(io, (struct send *)r);
+				break;
+			case TRANSMIT:
+				ioDestroyTransmit((struct transmit *)r);
+				break;
+			case SNAPSHOT_PUT:
+				ioFlushSnapshotPut(io,
+						   (struct snapshot_put *)r);
+				break;
+			case SNAPSHOT_GET:
+				ioFlushSnapshotGet(io,
+						   (struct snapshot_get *)r);
+				break;
+			case ASYNC_WORK:
+				ioFlushAsyncWork(io, (struct async_work *)r);
+				break;
+			default:
+				assert(0);
+		}
+	}
+}
+
+static void ioMethodClose(struct raft_io *raft_io, raft_io_close_cb cb)
+{
+	if (cb != NULL) {
+		cb(raft_io);
+	}
+}
+
+static int ioMethodLoad(struct raft_io *io,
+			raft_term *term,
+			raft_id *voted_for,
+			struct raft_snapshot **snapshot,
+			raft_index *start_index,
+			struct raft_entry **entries,
+			size_t *n_entries)
+{
+	struct io *s;
+	int rv;
+
+	s = io->impl;
+
+	*term = s->term;
+	*voted_for = s->voted_for;
+	*start_index = 1;
+
+	*n_entries = s->n;
+
+	/* Make a copy of the persisted entries, storing their data into a
+	 * single batch. */
+	rv = entryBatchCopy(s->entries, entries, s->n);
+	assert(rv == 0);
+
+	if (s->snapshot != NULL) {
+		*snapshot = raft_malloc(sizeof **snapshot);
+		assert(*snapshot != NULL);
+		rv = snapshotCopy(s->snapshot, *snapshot);
+		assert(rv == 0);
+		*start_index = (*snapshot)->index + 1;
+	} else {
+		*snapshot = NULL;
+	}
+
+	return 0;
+}
+
+static int ioMethodBootstrap(struct raft_io *raft_io,
+			     const struct raft_configuration *conf)
+{
+	struct io *io = raft_io->impl;
+	struct raft_buffer buf;
+	struct raft_entry *entries;
+	int rv;
+
+	if (io->term != 0) {
+		return RAFT_CANTBOOTSTRAP;
+	}
+
+	assert(io->voted_for == 0);
+	assert(io->snapshot == NULL);
+	assert(io->entries == NULL);
+	assert(io->n == 0);
+
+	/* Encode the given configuration. */
+	rv = configurationEncode(conf, &buf);
+	if (rv != 0) {
+		return rv;
+	}
+
+	entries = raft_calloc(1, sizeof *io->entries);
+	if (entries == NULL) {
+		return RAFT_NOMEM;
+	}
+
+	entries[0].term = 1;
+	entries[0].type = RAFT_CHANGE;
+	entries[0].buf = buf;
+
+	io->term = 1;
+	io->voted_for = 0;
+	io->snapshot = NULL;
+	io->entries = entries;
+	io->n = 1;
+
+	return 0;
+}
+
+static int ioMethodRecover(struct raft_io *io,
+			   const struct raft_configuration *conf)
+{
+	/* TODO: implement this API */
+	(void)io;
+	(void)conf;
+	return RAFT_IOERR;
+}
+
+static int ioMethodSetTerm(struct raft_io *raft_io, const raft_term term)
+{
+	struct io *io = raft_io->impl;
+
+	if (faultTick(&io->term_fault_countdown)) {
+		return RAFT_IOERR;
+	}
+
+	io->term = term;
+	io->voted_for = 0;
+
+	return 0;
+}
+
+static int ioMethodSetVote(struct raft_io *raft_io, const raft_id server_id)
+{
+	struct io *io = raft_io->impl;
+
+	if (faultTick(&io->vote_fault_countdown)) {
+		return RAFT_IOERR;
+	}
+
+	io->voted_for = server_id;
+
+	return 0;
+}
+
+static int ioMethodAppend(struct raft_io *raft_io,
+			  struct raft_io_append *req,
+			  const struct raft_entry entries[],
+			  unsigned n,
+			  raft_io_append_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct append *r;
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = APPEND;
+	r->completion_time = *io->time + io->disk_latency;
+	r->req = req;
+	r->entries = entries;
+	r->n = n;
+
+	req->cb = cb;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+
+	return 0;
+}
+
+static int ioMethodTruncate(struct raft_io *raft_io, raft_index index)
+{
+	struct io *io = raft_io->impl;
+	size_t n;
+
+	n = (size_t)(index - 1); /* Number of entries left after truncation */
+
+	if (n > 0) {
+		struct raft_entry *entries;
+
+		/* Create a new array of entries holding the non-truncated
+		 * entries */
+		entries = raft_malloc(n * sizeof *entries);
+		if (entries == NULL) {
+			return RAFT_NOMEM;
+		}
+		memcpy(entries, io->entries, n * sizeof *io->entries);
+
+		/* Release any truncated entry */
+		if (io->entries != NULL) {
+			size_t i;
+			for (i = n; i < io->n; i++) {
+				raft_free(io->entries[i].buf.base);
+			}
+			raft_free(io->entries);
+		}
+		io->entries = entries;
+	} else {
+		/* Release everything we have */
+		if (io->entries != NULL) {
+			size_t i;
+			for (i = 0; i < io->n; i++) {
+				raft_free(io->entries[i].buf.base);
+			}
+			raft_free(io->entries);
+			io->entries = NULL;
+		}
+	}
+
+	io->n = n;
+
+	return 0;
+}
+
+static int ioMethodSnapshotPut(struct raft_io *raft_io,
+			       unsigned trailing,
+			       struct raft_io_snapshot_put *req,
+			       const struct raft_snapshot *snapshot,
+			       raft_io_snapshot_put_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct snapshot_put *r;
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = SNAPSHOT_PUT;
+	r->req = req;
+	r->req->cb = cb;
+	r->snapshot = snapshot;
+	r->completion_time = *io->time + io->disk_latency;
+	r->trailing = trailing;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+
+	return 0;
+}
+
+static int ioMethodAsyncWork(struct raft_io *raft_io,
+			     struct raft_io_async_work *req,
+			     raft_io_async_work_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct async_work *r;
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = ASYNC_WORK;
+	r->req = req;
+	r->req->cb = cb;
+	r->completion_time = *io->time + io->work_duration;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+	return 0;
+}
+
+static int ioMethodSnapshotGet(struct raft_io *raft_io,
+			       struct raft_io_snapshot_get *req,
+			       raft_io_snapshot_get_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct snapshot_get *r;
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = SNAPSHOT_GET;
+	r->req = req;
+	r->req->cb = cb;
+	r->completion_time = *io->time + io->disk_latency;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+
+	return 0;
+}
+
+static raft_time ioMethodTime(struct raft_io *raft_io)
+{
+	struct io *io = raft_io->impl;
+	return *io->time;
+}
+
+static int ioMethodRandom(struct raft_io *raft_io, int min, int max)
+{
+	struct io *io = raft_io->impl;
+	int t = (int)io->randomized_election_timeout;
+	if (t < min) {
+		return min;
+	} else if (t > max) {
+		return max;
+	} else {
+		return t;
+	}
+}
+
+/* Queue up a request which will be processed later, when its completion time
+ * is reached or the whole queue is flushed. */
+static int ioMethodSend(struct raft_io *raft_io,
+			struct raft_io_send *req,
+			const struct raft_message *message,
+			raft_io_send_cb cb)
+{
+	struct io *io = raft_io->impl;
+	struct send *r;
+	struct peer *peer;
+
+	if (faultTick(&io->send_fault_countdown)) {
+		return RAFT_IOERR;
+	}
+
+	r = raft_malloc(sizeof *r);
+	assert(r != NULL);
+
+	r->type = SEND;
+	r->req = req;
+	r->message = *message;
+	r->req->cb = cb;
+
+	peer = ioGetPeer(io, message->server_id);
+	r->completion_time = *io->time + peer->send_latency;
+
+	QUEUE_PUSH(&io->requests, &r->queue);
+
+	return 0;
+}
+
+static void ioReceive(struct io *io, struct raft_message *message)
+{
+	io->recv_cb(io->io, message);
+	io->n_recv[message->type]++;
+}
+
+static void ioDeliverTransmit(struct io *io, struct transmit *transmit)
+{
+	struct raft_message *message = &transmit->message;
+	struct peer *peer; /* Destination peer */
+
+	/* If this message type is in the drop list, let's discard it */
+	if (io->drop[message->type - 1]) {
+		ioDestroyTransmit(transmit);
+		return;
+	}
+
+	peer = ioGetPeer(io, message->server_id);
+
+	/* If we don't have a peer with this ID, or the peer is disconnected,
+	 * or the connection is saturated, drop the message. */
+	if (peer == NULL || !peer->connected || peer->saturated) {
+		ioDestroyTransmit(transmit);
+		return;
+	}
+
+	/* Update the message object with our details. */
+	message->server_id = io->id;
+	message->server_address = io->address;
+
+	ioReceive(peer->io, message);
+	raft_free(transmit);
+}
+
+/* Connect @raft_io to @other, enabling delivery of messages sent from
+ * @raft_io to @other. */
+static void ioConnect(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	assert(io->n_peers < MAX_PEERS);
+	io->peers[io->n_peers].io = io_other;
+	io->peers[io->n_peers].connected = true;
+	io->peers[io->n_peers].saturated = false;
+	io->peers[io->n_peers].send_latency = SEND_LATENCY;
+	io->n_peers++;
+}
+
+/* Return whether the connection with the given peer is saturated. */
+static bool ioSaturated(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	struct peer *peer;
+	peer = ioGetPeer(io, io_other->id);
+	return peer != NULL && peer->saturated;
+}
+
+/* Disconnect @raft_io and @other, causing calls to @raft_io->send() to fail
+ * asynchronously when sending messages to @other. */
+static void ioDisconnect(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	struct peer *peer;
+	peer = ioGetPeer(io, io_other->id);
+	assert(peer != NULL);
+	peer->connected = false;
+}
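Note the two distinct failure modes modeled here: a disconnected link makes
io->send fail with RAFT_NOCONNECTION, while a saturated link accepts sends but
drops the messages in flight. With the public wrappers added further down in
this file, a test can script a partition like this (a sketch; `f` is a
hypothetical initialized fixture):

    raft_fixture_disconnect(&f, 0, 1); /* sends from 0 to 1 fail with RAFT_NOCONNECTION */
    raft_fixture_reconnect(&f, 0, 1);  /* sends succeed and are delivered again */
    raft_fixture_saturate(&f, 0, 1);   /* sends succeed but messages are dropped */
    raft_fixture_desaturate(&f, 0, 1);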
+
+/* Reconnect @raft_io and @other. */
+static void ioReconnect(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	struct peer *peer;
+	peer = ioGetPeer(io, io_other->id);
+	assert(peer != NULL);
+	peer->connected = true;
+}
+
+/* Saturate the connection from @io to @other, causing messages sent from @io
+ * to @other to be dropped. */
+static void ioSaturate(struct raft_io *io, struct raft_io *other)
+{
+	struct io *s;
+	struct io *s_other;
+	struct peer *peer;
+	s = io->impl;
+	s_other = other->impl;
+	peer = ioGetPeer(s, s_other->id);
+	assert(peer != NULL && peer->connected);
+	peer->saturated = true;
+}
+
+/* Desaturate the connection from @raft_io to @other, re-enabling delivery of
+ * messages sent from @raft_io to @other. */
+static void ioDesaturate(struct raft_io *raft_io, struct raft_io *other)
+{
+	struct io *io = raft_io->impl;
+	struct io *io_other = other->impl;
+	struct peer *peer;
+	peer = ioGetPeer(io, io_other->id);
+	assert(peer != NULL && peer->connected);
+	peer->saturated = false;
+}
+
+/* Enable or disable silently dropping all outgoing messages of type @type. */
+void ioDrop(struct io *io, int type, bool flag)
+{
+	io->drop[type - 1] = flag;
+}
+
+static int ioInit(struct raft_io *raft_io, unsigned index, raft_time *time)
+{
+	struct io *io;
+	io = raft_malloc(sizeof *io);
+	assert(io != NULL);
+	io->io = raft_io;
+	io->index = index;
+	io->time = time;
+	io->term = 0;
+	io->voted_for = 0;
+	io->snapshot = NULL;
+	io->entries = NULL;
+	io->n = 0;
+	QUEUE_INIT(&io->requests);
+	io->n_peers = 0;
+	io->randomized_election_timeout = ELECTION_TIMEOUT + index * 100;
+	io->network_latency = NETWORK_LATENCY;
+	io->disk_latency = DISK_LATENCY;
+	io->work_duration = WORK_DURATION;
+	io->append_fault_countdown = -1;
+	io->vote_fault_countdown = -1;
+	io->term_fault_countdown = -1;
+	io->send_fault_countdown = -1;
+	memset(io->drop, 0, sizeof io->drop);
+	memset(io->n_send, 0, sizeof io->n_send);
+	memset(io->n_recv, 0, sizeof io->n_recv);
+	io->n_append = 0;
+
+	raft_io->impl = io;
+	raft_io->version = 2;
+	raft_io->init = ioMethodInit;
+	raft_io->close = ioMethodClose;
+	raft_io->start = ioMethodStart;
+	raft_io->load = ioMethodLoad;
+	raft_io->bootstrap = ioMethodBootstrap;
+	raft_io->recover = ioMethodRecover;
+	raft_io->set_term = ioMethodSetTerm;
+	raft_io->set_vote = ioMethodSetVote;
+	raft_io->append = ioMethodAppend;
+	raft_io->truncate = ioMethodTruncate;
+	raft_io->send = ioMethodSend;
+	raft_io->snapshot_put = ioMethodSnapshotPut;
+	raft_io->async_work = ioMethodAsyncWork;
+	raft_io->snapshot_get = ioMethodSnapshotGet;
+	raft_io->time = ioMethodTime;
+	raft_io->random = ioMethodRandom;
+
+	return 0;
+}
+
+/* Release all memory held by the given stub I/O implementation. */
+void ioClose(struct raft_io *raft_io)
+{
+	struct io *io = raft_io->impl;
+	size_t i;
+	for (i = 0; i < io->n; i++) {
+		struct raft_entry *entry = &io->entries[i];
+		raft_free(entry->buf.base);
+	}
+	if (io->entries != NULL) {
+		raft_free(io->entries);
+	}
+	if (io->snapshot != NULL) {
+		snapshotClose(io->snapshot);
+		raft_free(io->snapshot);
+	}
+	raft_free(io);
+}
+
+/* Custom emit tracer function which includes the server ID. */
+static void emit(struct raft_tracer *t,
+		 const char *file,
+		 unsigned int line,
+		 const char *func,
+		 unsigned int level,
+		 const char *message)
+{
+	unsigned id = *(unsigned *)t->impl;
+	(void)func;
+	(void)level;
+	fprintf(stderr, "%d: %30s:%*d - %s\n", id, file, 3, line, message);
+}
+
+static int serverInit(struct raft_fixture *f, unsigned i, struct raft_fsm *fsm)
+{
+	int rv;
+	struct raft_fixture_server *s;
+	s = raft_malloc(sizeof(*s));
+	if (s == NULL) {
+		return RAFT_NOMEM;
+	}
+	f->servers[i] = s;
+	s->alive = true;
+	s->id = i + 1;
+	sprintf(s->address, "%llu", s->id);
+	rv = ioInit(&s->io, i, &f->time);
+	if (rv != 0) {
+		return rv;
+	}
+	rv = raft_init(&s->raft, &s->io, fsm, s->id, s->address);
+	if (rv != 0) {
+		return rv;
+	}
+	raft_set_election_timeout(&s->raft, ELECTION_TIMEOUT);
+	raft_set_heartbeat_timeout(&s->raft, HEARTBEAT_TIMEOUT);
+	raft_set_install_snapshot_timeout(&s->raft, INSTALL_SNAPSHOT_TIMEOUT);
+	s->tracer.impl = (void *)&s->id;
+	s->tracer.emit = emit;
+	s->raft.tracer = NULL;
+	return 0;
+}
+
+static void serverClose(struct raft_fixture_server *s)
+{
+	raft_close(&s->raft, NULL);
+	ioClose(&s->io);
+	raft_free(s);
+}
+
+/* Connect the server with the given index to all others */
+static void serverConnectToAll(struct raft_fixture *f, unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		struct raft_io *io1 = &f->servers[i]->io;
+		struct raft_io *io2 = &f->servers[j]->io;
+		if (i == j) {
+			continue;
+		}
+		ioConnect(io1, io2);
+	}
+}
+
+int raft_fixture_init(struct raft_fixture *f)
+{
+	f->time = 0;
+	f->n = 0;
+	f->log = logInit();
+	if (f->log == NULL) {
+		return RAFT_NOMEM;
+	}
+	f->commit_index = 0;
+	f->hook = NULL;
+	f->event = raft_malloc(sizeof(*f->event));
+	if (f->event == NULL) {
+		return RAFT_NOMEM;
+	}
+	return 0;
+}
+
+void raft_fixture_close(struct raft_fixture *f)
+{
+	unsigned i;
+	for (i = 0; i < f->n; i++) {
+		struct io *io = f->servers[i]->io.impl;
+		ioFlushAll(io);
+	}
+	for (i = 0; i < f->n; i++) {
+		serverClose(f->servers[i]);
+	}
+	raft_free(f->event);
+	logClose(f->log);
+}
+
+int raft_fixture_configuration(struct raft_fixture *f,
+			       unsigned n_voting,
+			       struct raft_configuration *configuration)
+{
+	unsigned i;
+	assert(f->n > 0);
+	assert(n_voting > 0);
+	assert(n_voting <= f->n);
+	raft_configuration_init(configuration);
+	for (i = 0; i < f->n; i++) {
+		struct raft_fixture_server *s;
+		int role = i < n_voting ? RAFT_VOTER : RAFT_STANDBY;
+		int rv;
+		s = f->servers[i];
+		rv = raft_configuration_add(configuration, s->id, s->address,
+					    role);
+		if (rv != 0) {
+			return rv;
+		}
+	}
+	return 0;
+}
+
+int raft_fixture_bootstrap(struct raft_fixture *f,
+			   struct raft_configuration *configuration)
+{
+	unsigned i;
+	for (i = 0; i < f->n; i++) {
+		struct raft *raft = raft_fixture_get(f, i);
+		int rv;
+		rv = raft_bootstrap(raft, configuration);
+		if (rv != 0) {
+			return rv;
+		}
+	}
+	return 0;
+}
+
+int raft_fixture_start(struct raft_fixture *f)
+{
+	unsigned i;
+	int rv;
+	for (i = 0; i < f->n; i++) {
+		struct raft_fixture_server *s = f->servers[i];
+		rv = raft_start(&s->raft);
+		if (rv != 0) {
+			return rv;
+		}
+	}
+	return 0;
+}
+
+unsigned raft_fixture_n(struct raft_fixture *f)
+{
+	return f->n;
+}
+
+raft_time raft_fixture_time(struct raft_fixture *f)
+{
+	return f->time;
+}
+
+struct raft *raft_fixture_get(struct raft_fixture *f, unsigned i)
+{
+	assert(i < f->n);
+	return &f->servers[i]->raft;
+}
+
+bool raft_fixture_alive(struct raft_fixture *f, unsigned i)
+{
+	assert(i < f->n);
+	return f->servers[i]->alive;
+}
+
+unsigned raft_fixture_leader_index(struct raft_fixture *f)
+{
+	if (f->leader_id != 0) {
+		return (unsigned)(f->leader_id - 1);
+	}
+	return f->n;
+}
+
+raft_id raft_fixture_voted_for(struct raft_fixture *f, unsigned i)
+{
+	struct io *io = f->servers[i]->io.impl;
+	return io->voted_for;
+}
+
+/* Update the leader and check for election safety.
+ *
+ * From Figure 3.2:
+ *
+ *   Election Safety -> At most one leader can be elected in a given term.
+ *
+ * Return true if the current leader turns out to be different from the one at
+ * the time this function was called. */
+static bool updateLeaderAndCheckElectionSafety(struct raft_fixture *f)
+{
+	raft_id leader_id = 0;
+	unsigned leader_i = 0;
+	raft_term leader_term = 0;
+	unsigned i;
+	bool changed;
+
+	for (i = 0; i < f->n; i++) {
+		struct raft *raft = raft_fixture_get(f, i);
+		unsigned j;
+
+		/* If the server is not alive or is not the leader, skip to the
+		 * next server. */
+		if (!raft_fixture_alive(f, i) ||
+		    raft_state(raft) != RAFT_LEADER) {
+			continue;
+		}
+
+		/* Check that no other server is leader for this term. */
+		for (j = 0; j < f->n; j++) {
+			struct raft *other = raft_fixture_get(f, j);
+
+			if (other->id == raft->id ||
+			    other->state != RAFT_LEADER) {
+				continue;
+			}
+
+			if (other->current_term == raft->current_term) {
+				fprintf(stderr,
+					"server %llu and %llu are both leaders "
+					"in term %llu",
+					raft->id, other->id,
+					raft->current_term);
+				abort();
+			}
+		}
+
+		if (raft->current_term > leader_term) {
+			leader_id = raft->id;
+			leader_i = i;
+			leader_term = raft->current_term;
+		}
+	}
+
+	/* Check that the leader is stable, in the sense that it has been
+	 * acknowledged by all alive servers connected to it, and those servers
+	 * together with the leader form a majority. */
+	if (leader_id != 0) {
+		unsigned n_acks = 0;
+		bool acked = true;
+		unsigned n_quorum = 0;
+
+		for (i = 0; i < f->n; i++) {
+			struct raft *raft = raft_fixture_get(f, i);
+			const struct raft_server *server =
+			    configurationGet(&raft->configuration, raft->id);
+
+			/* If the server is not in the configuration or is
+			 * idle, then don't count it. */
+			if (server == NULL || server->role == RAFT_SPARE) {
+				continue;
+			}
+
+			n_quorum++;
+
+			/* If this server is itself the leader, or it's not
+			 * alive or it's not connected to the leader, then
+			 * don't count it in for stability. */
+			if (i == leader_i || !raft_fixture_alive(f, i) ||
+			    raft_fixture_saturated(f, leader_i, i)) {
+				continue;
+			}
+
+			if (raft->current_term != leader_term) {
+				acked = false;
+				break;
+			}
+
+			if (raft->state != RAFT_FOLLOWER) {
+				acked = false;
+				break;
+			}
+
+			if (raft->follower_state.current_leader.id == 0) {
+				acked = false;
+				break;
+			}
+
+			if (raft->follower_state.current_leader.id !=
+			    leader_id) {
+				acked = false;
+				break;
+			}
+
+			n_acks++;
+		}
+
+		if (!acked || n_acks < (n_quorum / 2)) {
+			leader_id = 0;
+		}
+	}
+
+	changed = leader_id != f->leader_id;
+	f->leader_id = leader_id;
+
+	return changed;
+}
+
+/* Check for leader append-only.
+ *
+ * From Figure 3.2:
+ *
+ *   Leader Append-Only -> A leader never overwrites or deletes entries in its
+ *   own log; it only appends new entries. */
+static void checkLeaderAppendOnly(struct raft_fixture *f)
+{
+	struct raft *raft;
+	raft_index index;
+	raft_index last = logLastIndex(f->log);
+
+	/* If the cached log is empty it means there was no leader before. */
+	if (last == 0) {
+		return;
+	}
+
+	/* If there's no new leader, just return. */
+	if (f->leader_id == 0) {
+		return;
+	}
+
+	raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
+	last = logLastIndex(f->log);
+
+	for (index = 1; index <= last; index++) {
+		const struct raft_entry *entry1;
+		const struct raft_entry *entry2;
+		size_t i;
+
+		entry1 = logGet(f->log, index);
+		entry2 = logGet(raft->log, index);
+
+		assert(entry1 != NULL);
+
+		/* Check if the entry was snapshotted. */
+		if (entry2 == NULL) {
+			assert(raft->log->snapshot.last_index >= index);
+			continue;
+		}
+
+		/* Entry was not overwritten. */
+		assert(entry1->type == entry2->type);
+		assert(entry1->term == entry2->term);
+		for (i = 0; i < entry1->buf.len; i++) {
+			assert(((uint8_t *)entry1->buf.base)[i] ==
+			       ((uint8_t *)entry2->buf.base)[i]);
+		}
+	}
+}
+
+/* Make a copy of the current leader's log, in order to perform the Leader
+ * Append-Only check at the next iteration. */
+static void copyLeaderLog(struct raft_fixture *f)
+{
+	struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
+	struct raft_entry *entries;
+	unsigned n;
+	size_t i;
+	int rv;
+	logClose(f->log);
+	f->log = logInit();
+	if (f->log == NULL) {
+		assert(false);
+		return;
+	}
+
+	rv = logAcquire(raft->log, 1, &entries, &n);
+	assert(rv == 0);
+	for (i = 0; i < n; i++) {
+		struct raft_entry *entry = &entries[i];
+		struct raft_buffer buf;
+		buf.len = entry->buf.len;
+		buf.base = raft_malloc(buf.len);
+		assert(buf.base != NULL);
+		memcpy(buf.base, entry->buf.base, buf.len);
+		rv = logAppend(f->log, entry->term, entry->type, &buf, NULL);
+		assert(rv == 0);
+	}
+	logRelease(raft->log, 1, entries, n);
+}
+
+/* Update the commit index to match the one from the current leader. */
+static void updateCommitIndex(struct raft_fixture *f)
+{
+	struct raft *raft = raft_fixture_get(f, (unsigned)f->leader_id - 1);
+	if (raft->commit_index > f->commit_index) {
+		f->commit_index = raft->commit_index;
+	}
+}
+
+/* Return the lowest tick time across all servers, along with the associated
+ * server index */
+static void getLowestTickTime(struct raft_fixture *f, raft_time *t, unsigned *i)
+{
+	unsigned j;
+	*t = (raft_time)-1 /* Maximum value */;
+	for (j = 0; j < f->n; j++) {
+		struct io *io = f->servers[j]->io.impl;
+		if (io->next_tick < *t) {
+			*t = io->next_tick;
+			*i = j;
+		}
+	}
+}
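getLowestTickTime and getLowestRequestCompletionTime (below) feed the
discrete-event loop in raft_fixture_step: the global clock always jumps
straight to the earliest pending event, whether that is a timer tick or an
I/O completion. A sketch of the selection rule with hypothetical values:

    raft_time tick_time = 1100;       /* earliest next_tick across servers */
    raft_time completion_time = 1015; /* earliest pending I/O completion */
    raft_time now =
        tick_time < completion_time ? tick_time : completion_time; /* 1015 */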
+
+/* Return the completion time of the request with the lowest completion time
+ * across all servers, along with the associated server index. */
+static void getLowestRequestCompletionTime(struct raft_fixture *f,
+					   raft_time *t,
+					   unsigned *i)
+{
+	unsigned j;
+	*t = (raft_time)-1 /* Maximum value */;
+	for (j = 0; j < f->n; j++) {
+		struct io *io = f->servers[j]->io.impl;
+		queue *head;
+		QUEUE_FOREACH(head, &io->requests)
+		{
+			struct ioRequest *r =
+			    QUEUE_DATA(head, struct ioRequest, queue);
+			if (r->completion_time < *t) {
+				*t = r->completion_time;
+				*i = j;
+			}
+		}
+	}
+}
+
+/* Fire the tick callback of the i'th server. */
+static void fireTick(struct raft_fixture *f, unsigned i)
+{
+	struct io *io = f->servers[i]->io.impl;
+	f->time = io->next_tick;
+	f->event->server_index = i;
+	f->event->type = RAFT_FIXTURE_TICK;
+	io->next_tick += io->tick_interval;
+	if (f->servers[i]->alive) {
+		io->tick_cb(io->io);
+	}
+}
+
+/* Complete the first request with completion time @t on the @i'th server. */
+static void completeRequest(struct raft_fixture *f, unsigned i, raft_time t)
+{
+	struct io *io = f->servers[i]->io.impl;
+	queue *head;
+	struct ioRequest *r = NULL;
+	bool found = false;
+	f->time = t;
+	f->event->server_index = i;
+	QUEUE_FOREACH(head, &io->requests)
+	{
+		r = QUEUE_DATA(head, struct ioRequest, queue);
+		if (r->completion_time == t) {
+			found = true;
+			break;
+		}
+	}
+	assert(found);
+	QUEUE_REMOVE(head);
+	switch (r->type) {
+		case APPEND:
+			ioFlushAppend(io, (struct append *)r);
+			f->event->type = RAFT_FIXTURE_DISK;
+			break;
+		case SEND:
+			ioFlushSend(io, (struct send *)r);
+			f->event->type = RAFT_FIXTURE_NETWORK;
+			break;
+		case TRANSMIT:
+			ioDeliverTransmit(io, (struct transmit *)r);
+			f->event->type = RAFT_FIXTURE_NETWORK;
+			break;
+		case SNAPSHOT_PUT:
+			ioFlushSnapshotPut(io, (struct snapshot_put *)r);
+			f->event->type = RAFT_FIXTURE_DISK;
+			break;
+		case SNAPSHOT_GET:
+			ioFlushSnapshotGet(io, (struct snapshot_get *)r);
+			f->event->type = RAFT_FIXTURE_DISK;
+			break;
+		case ASYNC_WORK:
+			ioFlushAsyncWork(io, (struct async_work *)r);
+			f->event->type = RAFT_FIXTURE_WORK;
+			break;
+		default:
+			assert(0);
+	}
+}
+
+struct raft_fixture_event *raft_fixture_step(struct raft_fixture *f)
+{
+	raft_time tick_time;
+	raft_time completion_time;
+	unsigned i = f->n;
+	unsigned j = f->n;
+
+	getLowestTickTime(f, &tick_time, &i);
+	getLowestRequestCompletionTime(f, &completion_time, &j);
+
+	assert(i < f->n || j < f->n);
+
+	if (tick_time < completion_time ||
+	    (tick_time == completion_time && i <= j)) {
+		fireTick(f, i);
+	} else {
+		completeRequest(f, j, completion_time);
+	}
+
+	/* If the leader has not changed check the Leader Append-Only
+	 * guarantee. */
+	if (!updateLeaderAndCheckElectionSafety(f)) {
+		checkLeaderAppendOnly(f);
+	}
+
+	/* If we have a leader, update leader-related state. */
+	if (f->leader_id != 0) {
+		copyLeaderLog(f);
+		updateCommitIndex(f);
+	}
+
+	if (f->hook != NULL) {
+		f->hook(f, f->event);
+	}
+
+	return f->event;
+}
+
+struct raft_fixture_event *raft_fixture_step_n(struct raft_fixture *f,
+					       unsigned n)
+{
+	unsigned i;
+	assert(n > 0);
+	for (i = 0; i < n - 1; i++) {
+		raft_fixture_step(f);
+	}
+	return raft_fixture_step(f);
+}
+
+bool raft_fixture_step_until(struct raft_fixture *f,
+			     bool (*stop)(struct raft_fixture *f, void *arg),
+			     void *arg,
+			     unsigned max_msecs)
+{
+	raft_time start = f->time;
+	while (!stop(f, arg) && (f->time - start) < max_msecs) {
+		raft_fixture_step(f);
+	}
+	return f->time - start < max_msecs;
+}
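raft_fixture_step_until takes an arbitrary stop predicate, which is how all
the step_until_* helpers below are built. A sketch of a custom predicate
(hypothetical test code):

    static bool firstServerApplied(struct raft_fixture *f, void *arg)
    {
            raft_index index = *(raft_index *)arg;
            return raft_last_applied(raft_fixture_get(f, 0)) >= index;
    }

    /* In a test: step for at most 5000 fixture milliseconds. */
    raft_index index = 2;
    bool reached = raft_fixture_step_until(&f, firstServerApplied, &index, 5000);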
+
+/* A step function which always returns false, forcing
+ * raft_fixture_step_until to advance time at each iteration. */
+static bool spin(struct raft_fixture *f, void *arg)
+{
+	(void)f;
+	(void)arg;
+	return false;
+}
+
+void raft_fixture_step_until_elapsed(struct raft_fixture *f, unsigned msecs)
+{
+	raft_fixture_step_until(f, spin, NULL, msecs);
+}
+
+static bool hasLeader(struct raft_fixture *f, void *arg)
+{
+	(void)arg;
+	return f->leader_id != 0;
+}
+
+bool raft_fixture_step_until_has_leader(struct raft_fixture *f,
+					unsigned max_msecs)
+{
+	return raft_fixture_step_until(f, hasLeader, NULL, max_msecs);
+}
+
+static bool hasNoLeader(struct raft_fixture *f, void *arg)
+{
+	(void)arg;
+	return f->leader_id == 0;
+}
+
+bool raft_fixture_step_until_has_no_leader(struct raft_fixture *f,
+					   unsigned max_msecs)
+{
+	return raft_fixture_step_until(f, hasNoLeader, NULL, max_msecs);
+}
+
+/* Enable/disable dropping outgoing messages of a certain type from all servers
+ * except one. */
+static void dropAllExcept(struct raft_fixture *f,
+			  int type,
+			  bool flag,
+			  unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		struct raft_fixture_server *s = f->servers[j];
+		if (j == i) {
+			continue;
+		}
+		ioDrop(s->io.impl, type, flag);
+	}
+}
+
+/* Set the randomized election timeout of the given server to the minimum value
+ * compatible with its current state and timers. */
+static void minimizeRandomizedElectionTimeout(struct raft_fixture *f,
+					      unsigned i)
+{
+	struct raft *raft = &f->servers[i]->raft;
+	raft_time now = raft->io->time(raft->io);
+	unsigned timeout = raft->election_timeout;
+	assert(raft->state == RAFT_FOLLOWER);
+
+	/* If the minimum election timeout value would make the timer expire in
+	 * the past, cap it. */
+	if (now - raft->election_timer_start > timeout) {
+		timeout = (unsigned)(now - raft->election_timer_start);
+	}
+
+	raft->follower_state.randomized_election_timeout = timeout;
+}
+
+/* Set the randomized election timeout to the maximum value on all servers
+ * except the given one. */
+static void maximizeAllRandomizedElectionTimeoutsExcept(struct raft_fixture *f,
+							unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		struct raft *raft = &f->servers[j]->raft;
+		unsigned timeout = raft->election_timeout * 2;
+		if (j == i) {
+			continue;
+		}
+		assert(raft->state == RAFT_FOLLOWER);
+		raft->follower_state.randomized_election_timeout = timeout;
+	}
+}
+
+void raft_fixture_hook(struct raft_fixture *f, raft_fixture_event_cb hook)
+{
+	f->hook = hook;
+}
+
+void raft_fixture_start_elect(struct raft_fixture *f, unsigned i)
+{
+	struct raft *raft = raft_fixture_get(f, i);
+	unsigned j;
+
+	/* Make sure there's currently no leader. */
+	assert(f->leader_id == 0);
+
+	/* Make sure that the given server is voting. */
+	assert(configurationGet(&raft->configuration, raft->id)->role ==
+	       RAFT_VOTER);
+
+	/* Make sure all servers are currently followers. */
+	for (j = 0; j < f->n; j++) {
+		assert(raft_state(&f->servers[j]->raft) == RAFT_FOLLOWER);
+	}
+
+	/* Pretend that the last randomized election timeout was set at the
+	 * maximum value on all servers except the one to be elected, which is
+	 * instead set to the minimum possible value compatible with its
+	 * current state. */
+	minimizeRandomizedElectionTimeout(f, i);
+	maximizeAllRandomizedElectionTimeoutsExcept(f, i);
+}
+
+void raft_fixture_elect(struct raft_fixture *f, unsigned i)
+{
+	struct raft *raft = raft_fixture_get(f, i);
+	raft_fixture_start_elect(f, i);
+	raft_fixture_step_until_has_leader(f, ELECTION_TIMEOUT * 20);
+	assert(f->leader_id == raft->id);
+}
+
+void raft_fixture_depose(struct raft_fixture *f)
+{
+	unsigned leader_i;
+
+	/* Make sure there's a leader. */
+	assert(f->leader_id != 0);
+	leader_i = (unsigned)f->leader_id - 1;
+	assert(raft_state(&f->servers[leader_i]->raft) == RAFT_LEADER);
+
+	/* Set a very large election timeout on all followers, to prevent them
+	 * from starting an election. */
+	maximizeAllRandomizedElectionTimeoutsExcept(f, leader_i);
+
+	/* Prevent all servers from sending append entries results, so the
+	 * leader will eventually step down. */
+	dropAllExcept(f, RAFT_IO_APPEND_ENTRIES_RESULT, true, leader_i);
+
+	raft_fixture_step_until_has_no_leader(f, ELECTION_TIMEOUT * 3);
+	assert(f->leader_id == 0);
+
+	dropAllExcept(f, RAFT_IO_APPEND_ENTRIES_RESULT, false, leader_i);
+}
+
+struct step_apply
+{
+	unsigned i;
+	raft_index index;
+};
+
+static bool hasAppliedIndex(struct raft_fixture *f, void *arg)
+{
+	struct step_apply *apply = (struct step_apply *)arg;
+	struct raft *raft;
+	unsigned n = 0;
+	unsigned i;
+
+	if (apply->i < f->n) {
+		raft = raft_fixture_get(f, apply->i);
+		return raft_last_applied(raft) >= apply->index;
+	}
+
+	for (i = 0; i < f->n; i++) {
+		raft = raft_fixture_get(f, i);
+		if (raft_last_applied(raft) >= apply->index) {
+			n++;
+		}
+	}
+	return n == f->n;
+}
+
+bool raft_fixture_step_until_applied(struct raft_fixture *f,
+				     unsigned i,
+				     raft_index index,
+				     unsigned max_msecs)
+{
+	struct step_apply apply = {i, index};
+	return raft_fixture_step_until(f, hasAppliedIndex, &apply, max_msecs);
+}
+
+struct step_state
+{
+	unsigned i;
+	int state;
+};
+
+static bool hasState(struct raft_fixture *f, void *arg)
+{
+	struct step_state *target = (struct step_state *)arg;
+	struct raft *raft;
+	raft = raft_fixture_get(f, target->i);
+	return raft_state(raft) == target->state;
+}
+
+bool raft_fixture_step_until_state_is(struct raft_fixture *f,
+				      unsigned i,
+				      int state,
+				      unsigned max_msecs)
+{
+	struct step_state target = {i, state};
+	return raft_fixture_step_until(f, hasState, &target, max_msecs);
+}
+
+struct step_term
+{
+	unsigned i;
+	raft_term term;
+};
+
+static bool hasTerm(struct raft_fixture *f, void *arg)
+{
+	struct step_term *target = (struct step_term *)arg;
+	struct raft *raft;
+	raft = raft_fixture_get(f, target->i);
+	return raft->current_term == target->term;
+}
+
+bool raft_fixture_step_until_term_is(struct raft_fixture *f,
+				     unsigned i,
+				     raft_term term,
+				     unsigned max_msecs)
+{
+	struct step_term target = {i, term};
+	return raft_fixture_step_until(f, hasTerm, &target, max_msecs);
+}
+
+struct step_vote
+{
+	unsigned i;
+	unsigned j;
+};
+
+static bool hasVotedFor(struct raft_fixture *f, void *arg)
+{
+	struct step_vote *target = (struct step_vote *)arg;
+	struct raft *raft;
+	raft = raft_fixture_get(f, target->i);
+	return raft->voted_for == target->j + 1;
+}
+
+bool raft_fixture_step_until_voted_for(struct raft_fixture *f,
+				       unsigned i,
+				       unsigned j,
+				       unsigned max_msecs)
+{
+	struct step_vote target = {i, j};
+	return raft_fixture_step_until(f, hasVotedFor, &target, max_msecs);
+}
+
+struct step_deliver
+{
+	unsigned i;
+	unsigned j;
+};
+
+static bool hasDelivered(struct raft_fixture *f, void *arg)
+{
+	struct step_deliver *target = (struct step_deliver *)arg;
+	struct raft *raft;
+	struct io *io;
+	struct raft_message *message;
+	queue *head;
+	raft = raft_fixture_get(f, target->i);
+	io = raft->io->impl;
+	QUEUE_FOREACH(head, &io->requests)
+	{
+		struct ioRequest *r;
+		r = QUEUE_DATA(head, struct ioRequest, queue);
+		message = NULL;
+		switch (r->type) {
+			case SEND:
+				message = &((struct send *)r)->message;
+				break;
+			case TRANSMIT:
+				message = &((struct transmit *)r)->message;
+				break;
+		}
+		if (message != NULL && message->server_id == target->j + 1) {
+			return false;
+		}
+	}
+	return true;
+}
+
+bool raft_fixture_step_until_delivered(struct raft_fixture *f,
+				       unsigned i,
+				       unsigned j,
+				       unsigned max_msecs)
+{
+	struct step_deliver target = {i, j};
+	return raft_fixture_step_until(f, hasDelivered, &target, max_msecs);
+}
+
+void raft_fixture_disconnect(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	ioDisconnect(io1, io2);
+}
+
+void raft_fixture_reconnect(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	ioReconnect(io1, io2);
+}
+
+void raft_fixture_saturate(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	ioSaturate(io1, io2);
+}
+
+static void disconnectFromAll(struct raft_fixture *f, unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		if (j == i) {
+			continue;
+		}
+		raft_fixture_saturate(f, i, j);
+		raft_fixture_saturate(f, j, i);
+	}
+}
+
+static void reconnectToAll(struct raft_fixture *f, unsigned i)
+{
+	unsigned j;
+	for (j = 0; j < f->n; j++) {
+		if (j == i) {
+			continue;
+		}
+		/* Don't reconnect to disconnected peers */
+		if (!f->servers[j]->alive) {
+			continue;
+		}
+		raft_fixture_desaturate(f, i, j);
+		raft_fixture_desaturate(f, j, i);
+	}
+}
+
+bool raft_fixture_saturated(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	return ioSaturated(io1, io2);
+}
+
+void raft_fixture_desaturate(struct raft_fixture *f, unsigned i, unsigned j)
+{
+	struct raft_io *io1 = &f->servers[i]->io;
+	struct raft_io *io2 = &f->servers[j]->io;
+	ioDesaturate(io1, io2);
+}
+
+void raft_fixture_kill(struct raft_fixture *f, unsigned i)
+{
+	disconnectFromAll(f, i);
+	f->servers[i]->alive = false;
+}
+
+void raft_fixture_revive(struct raft_fixture *f, unsigned i)
+{
+	reconnectToAll(f, i);
+	f->servers[i]->alive = true;
+}
+
+int raft_fixture_grow(struct raft_fixture *f, struct raft_fsm *fsm)
+{
+	unsigned i;
+	unsigned j;
+	int rc;
+	i = f->n;
+	f->n++;
+
+	rc = serverInit(f, i, fsm);
+	if (rc != 0) {
+		return rc;
+	}
+
+	serverConnectToAll(f, i);
+	for (j = 0; j < f->n; j++) {
+		struct raft_io *io1 = &f->servers[i]->io;
+		struct raft_io *io2 = &f->servers[j]->io;
+		ioConnect(io2, io1);
+	}
+
+	return 0;
+}
+
+void raft_fixture_set_randomized_election_timeout(struct raft_fixture *f,
+						  unsigned i,
+						  unsigned msecs)
+{
+	struct io *io = f->servers[i]->io.impl;
+	io->randomized_election_timeout = msecs;
+}
+
+void raft_fixture_set_network_latency(struct raft_fixture *f,
+				      unsigned i,
+				      unsigned msecs)
+{
+	struct io *io = f->servers[i]->io.impl;
+	io->network_latency = msecs;
+}
+
+void raft_fixture_set_disk_latency(struct raft_fixture *f,
+				   unsigned i,
+				   unsigned msecs)
+{
+	struct io *io = f->servers[i]->io.impl;
+	io->disk_latency = msecs;
+}
+
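The latency setters above, together with the per-link send latency that
follows, let a test shape the simulated environment before stepping the
cluster; a sketch (hypothetical values, `f` an initialized fixture):

    raft_fixture_set_network_latency(&f, 0, 50); /* RPCs from server 0 take 50 ms */
    raft_fixture_set_disk_latency(&f, 1, 100);   /* server 1's disk writes take 100 ms */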
+void raft_fixture_set_send_latency(struct raft_fixture *f, + unsigned i, + unsigned j, + unsigned msecs) +{ + struct io *io = f->servers[i]->io.impl; + struct peer *peer = ioGetPeer(io, f->servers[j]->id); + peer->send_latency = msecs; +} + +void raft_fixture_set_term(struct raft_fixture *f, unsigned i, raft_term term) +{ + struct io *io = f->servers[i]->io.impl; + io->term = term; +} + +void raft_fixture_set_snapshot(struct raft_fixture *f, + unsigned i, + struct raft_snapshot *snapshot) +{ + struct io *io = f->servers[i]->io.impl; + io->snapshot = snapshot; +} + +void raft_fixture_add_entry(struct raft_fixture *f, + unsigned i, + struct raft_entry *entry) +{ + struct io *io = f->servers[i]->io.impl; + struct raft_entry *entries; + entries = raft_realloc(io->entries, (io->n + 1) * sizeof *entries); + assert(entries != NULL); + entries[io->n] = *entry; + io->entries = entries; + io->n++; +} + +void raft_fixture_append_fault(struct raft_fixture *f, unsigned i, int delay) +{ + struct io *io = f->servers[i]->io.impl; + io->append_fault_countdown = delay; +} + +void raft_fixture_vote_fault(struct raft_fixture *f, unsigned i, int delay) +{ + struct io *io = f->servers[i]->io.impl; + io->vote_fault_countdown = delay; +} + +void raft_fixture_term_fault(struct raft_fixture *f, unsigned i, int delay) +{ + struct io *io = f->servers[i]->io.impl; + io->term_fault_countdown = delay; +} + +void raft_fixture_send_fault(struct raft_fixture *f, unsigned i, int delay) +{ + struct io *io = f->servers[i]->io.impl; + io->send_fault_countdown = delay; +} + +unsigned raft_fixture_n_send(struct raft_fixture *f, unsigned i, int type) +{ + struct io *io = f->servers[i]->io.impl; + return io->n_send[type]; +} + +unsigned raft_fixture_n_recv(struct raft_fixture *f, unsigned i, int type) +{ + struct io *io = f->servers[i]->io.impl; + return io->n_recv[type]; +} + +void raft_fixture_make_unavailable(struct raft_fixture *f, unsigned i) +{ + struct raft *r = &f->servers[i]->raft; + convertToUnavailable(r); +} diff --git a/src/raft/flags.c b/src/raft/flags.c new file mode 100644 index 000000000..7247613ab --- /dev/null +++ b/src/raft/flags.c @@ -0,0 +1,16 @@ +#include "flags.h" + +inline raft_flags flagsSet(raft_flags in, raft_flags flags) +{ + return in | flags; +} + +inline raft_flags flagsClear(raft_flags in, raft_flags flags) +{ + return in & (~flags); +} + +inline bool flagsIsSet(raft_flags in, raft_flags flag) +{ + return (bool)(in & flag); +} diff --git a/src/raft/flags.h b/src/raft/flags.h new file mode 100644 index 000000000..79d2a8428 --- /dev/null +++ b/src/raft/flags.h @@ -0,0 +1,20 @@ +#ifndef FLAGS_H_ +#define FLAGS_H_ + +#include "../raft.h" + +#define RAFT_DEFAULT_FEATURE_FLAGS (0) + +/* Adds the flags @flags to @in and returns the new flags. Multiple flags should + * be combined using the `|` operator. */ +raft_flags flagsSet(raft_flags in, raft_flags flags); + +/* Clears the flags @flags from @in and returns the new flags. Multiple flags + * should be combined using the `|` operator. */ +raft_flags flagsClear(raft_flags in, raft_flags flags); + +/* Returns `true` if the single flag @flag is set in @in, otherwise returns + * `false`. 
 */
+bool flagsIsSet(raft_flags in, raft_flags flag);
+
+#endif /* FLAGS_H_ */
diff --git a/src/raft/heap.c b/src/raft/heap.c
new file mode 100644
index 000000000..9361cd12b
--- /dev/null
+++ b/src/raft/heap.c
@@ -0,0 +1,121 @@
+#include "heap.h"
+
+#include <stdlib.h>
+
+#include "../raft.h"
+
+static void *defaultMalloc(void *data, size_t size)
+{
+	(void)data;
+	return malloc(size);
+}
+
+static void defaultFree(void *data, void *ptr)
+{
+	(void)data;
+	free(ptr);
+}
+
+static void *defaultCalloc(void *data, size_t nmemb, size_t size)
+{
+	(void)data;
+	return calloc(nmemb, size);
+}
+
+static void *defaultRealloc(void *data, void *ptr, size_t size)
+{
+	(void)data;
+	return realloc(ptr, size);
+}
+
+static void *defaultAlignedAlloc(void *data, size_t alignment, size_t size)
+{
+	(void)data;
+	return aligned_alloc(alignment, size);
+}
+
+static void defaultAlignedFree(void *data, size_t alignment, void *ptr)
+{
+	(void)alignment;
+	defaultFree(data, ptr);
+}
+
+static struct raft_heap defaultHeap = {
+	NULL,                /* data */
+	defaultMalloc,       /* malloc */
+	defaultFree,         /* free */
+	defaultCalloc,       /* calloc */
+	defaultRealloc,      /* realloc */
+	defaultAlignedAlloc, /* aligned_alloc */
+	defaultAlignedFree   /* aligned_free */
+};
+
+static struct raft_heap *currentHeap = &defaultHeap;
+
+void *RaftHeapMalloc(size_t size)
+{
+	return currentHeap->malloc(currentHeap->data, size);
+}
+
+void RaftHeapFree(void *ptr)
+{
+	if (ptr == NULL) {
+		return;
+	}
+	currentHeap->free(currentHeap->data, ptr);
+}
+
+void *RaftHeapCalloc(size_t nmemb, size_t size)
+{
+	return currentHeap->calloc(currentHeap->data, nmemb, size);
+}
+
+void *RaftHeapRealloc(void *ptr, size_t size)
+{
+	return currentHeap->realloc(currentHeap->data, ptr, size);
+}
+
+void *raft_malloc(size_t size)
+{
+	return RaftHeapMalloc(size);
+}
+
+void raft_free(void *ptr)
+{
+	RaftHeapFree(ptr);
+}
+
+void *raft_calloc(size_t nmemb, size_t size)
+{
+	return RaftHeapCalloc(nmemb, size);
+}
+
+void *raft_realloc(void *ptr, size_t size)
+{
+	return RaftHeapRealloc(ptr, size);
+}
+
+void *raft_aligned_alloc(size_t alignment, size_t size)
+{
+	return currentHeap->aligned_alloc(currentHeap->data, alignment, size);
+}
+
+void raft_aligned_free(size_t alignment, void *ptr)
+{
+	currentHeap->aligned_free(currentHeap->data, alignment, ptr);
+}
+
+void raft_heap_set(struct raft_heap *heap)
+{
+	currentHeap = heap;
+}
+
+void raft_heap_set_default(void)
+{
+	currentHeap = &defaultHeap;
+}
+
+const struct raft_heap *raft_heap_get(void)
+{
+	return currentHeap;
+}
diff --git a/src/raft/heap.h b/src/raft/heap.h
new file mode 100644
index 000000000..005b5ea9c
--- /dev/null
+++ b/src/raft/heap.h
@@ -0,0 +1,16 @@
+/* Internal heap APIs. */
+
+#ifndef HEAP_H_
+#define HEAP_H_
+
+#include <stddef.h>
+
+void *RaftHeapMalloc(size_t size);
+
+void *RaftHeapCalloc(size_t nmemb, size_t size);
+
+void *RaftHeapRealloc(void *ptr, size_t size);
+
+void RaftHeapFree(void *ptr);
+
+#endif /* HEAP_H_ */
diff --git a/src/raft/lifecycle.c b/src/raft/lifecycle.c
new file mode 100644
index 000000000..bd6d618c7
--- /dev/null
+++ b/src/raft/lifecycle.c
@@ -0,0 +1,36 @@
+#include "lifecycle.h"
+#include "../tracing.h"
+#include "queue.h"
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
+
+static bool reqIdIsSet(const struct request *req)
+{
+	return req->req_id[15] == (uint8_t)-1;
+}
+
+static uint64_t extractReqId(const struct request *req)
+{
+	uint64_t id;
+	memcpy(&id, &req->req_id, sizeof(id));
+	return id;
+}
+
+void lifecycleRequestStart(struct raft *r, struct request *req)
+{
+	if (reqIdIsSet(req)) {
+		tracef("request start id:%" PRIu64, extractReqId(req));
+	}
+	QUEUE_PUSH(&r->leader_state.requests, &req->queue);
+}
+
+void lifecycleRequestEnd(struct raft *r, struct request *req)
+{
+	(void)r;
+	if (reqIdIsSet(req)) {
+		tracef("request end id:%" PRIu64, extractReqId(req));
+	}
+	QUEUE_REMOVE(&req->queue);
+}
diff --git a/src/raft/lifecycle.h b/src/raft/lifecycle.h
new file mode 100644
index 000000000..616a260a0
--- /dev/null
+++ b/src/raft/lifecycle.h
@@ -0,0 +1,10 @@
+#ifndef LIFECYCLE_H_
+#define LIFECYCLE_H_
+
+#include "../raft.h"
+#include "request.h"
+
+void lifecycleRequestStart(struct raft *r, struct request *req);
+void lifecycleRequestEnd(struct raft *r, struct request *req);
+
+#endif
diff --git a/src/raft/log.c b/src/raft/log.c
new file mode 100644
index 000000000..434ad952f
--- /dev/null
+++ b/src/raft/log.c
@@ -0,0 +1,996 @@
+#include "log.h"
+
+#include <string.h>
+
+#include "../raft.h"
+#include "assert.h"
+#include "configuration.h"
+
+/* Calculate the reference count hash table key for the given log entry index
+ * in a hash table of the given size.
+ *
+ * The hash is simply the log entry index minus one modulo the size. This
+ * minimizes conflicts in the most frequent case, where a new log entry is
+ * simply appended to the log and can use the hash table bucket next to the
+ * bucket for the entry with the previous index (possibly resizing the table if
+ * its capacity is reached). */
+static size_t refsKey(const raft_index index, const size_t size)
+{
+	assert(index > 0);
+	assert(size > 0);
+	return (size_t)((index - 1) % size);
+}
+
+/* Try to insert a new reference count item for the given log entry index into
+ * the given reference count hash table.
+ *
+ * A collision happens when the bucket associated with the hash key of the
+ * given log entry index is already used to refcount log entries with a
+ * different index. In that case the collision output parameter will be set to
+ * true and no new reference count item is inserted into the hash table.
+ *
+ * If two log entries have the same index but different terms, the associated
+ * bucket will be grown accordingly. */
+static int refsTryInsert(struct raft_entry_ref *table,
+			 const size_t size,
+			 const raft_term term,
+			 const raft_index index,
+			 const unsigned short count,
+			 struct raft_buffer buf,
+			 void *batch,
+			 bool *collision)
+{
+	struct raft_entry_ref *bucket; /* Bucket associated with this index. */
+	struct raft_entry_ref *next_slot; /* For traversing the bucket slots. */
+	struct raft_entry_ref
+	    *last_slot; /* To track the last traversed slot. */
+	struct raft_entry_ref *slot; /* Actual slot to use for this entry.
*/ + size_t key; + + assert(table != NULL); + assert(size > 0); + assert(term > 0); + assert(index > 0); + assert(count > 0); + assert(collision != NULL); + + /* Calculate the hash table key for the given index. */ + key = refsKey(index, size); + bucket = &table[key]; + + /* If a bucket is empty, then there's no collision and we can fill its + * first slot. */ + if (bucket->count == 0) { + assert(bucket->next == NULL); + slot = bucket; + goto fill; + } + + /* If the bucket is already used to refcount entries with a different + * index, then we have a collision and we must abort here. */ + if (bucket->index != index) { + *collision = true; + return 0; + } + + /* If we get here it means that the bucket is in use to refcount one or + * more entries with the same index as the given one, but different + * terms. + * + * We must append a newly allocated slot to refcount the entry with this + * term. + * + * So first let's find the last slot in the bucket. */ + for (next_slot = bucket; next_slot != NULL; + next_slot = next_slot->next) { + /* All entries in a bucket must have the same index. */ + assert(next_slot->index == index); + + /* It should never happen that two entries with the same index + * and term get appended. So no existing slot in this bucket + * must track an entry with the same term as the given one. */ + assert(next_slot->term != term); + + last_slot = next_slot; + } + + /* The last slot must have no next slot. */ + assert(last_slot->next == NULL); + + slot = raft_malloc(sizeof *slot); + if (slot == NULL) { + return RAFT_NOMEM; + } + + last_slot->next = slot; + +fill: + slot->term = term; + slot->index = index; + slot->count = count; + slot->buf = buf; + slot->batch = batch; + slot->next = NULL; + + *collision = false; + + return 0; +} + +/* Move the slots of the given bucket into the given reference count hash + * table. The key of the bucket to use in the given table will be re-calculated + * according to the given size. */ +static int refsMove(struct raft_entry_ref *bucket, + struct raft_entry_ref *table, + const size_t size) +{ + struct raft_entry_ref *slot; + struct raft_entry_ref *next_slot; + + assert(bucket != NULL); + assert(table != NULL); + assert(size > 0); + + /* Only non-empty buckets should be moved. */ + assert(bucket->count > 0); + + /* For each slot in the bucket, insert the relevant entry in the given + * table, then free it. */ + next_slot = bucket; + while (next_slot != NULL) { + bool collision; + int rv; + + slot = next_slot; + + /* Insert the reference count for this entry into the new table. + */ + rv = refsTryInsert(table, size, slot->term, slot->index, + slot->count, slot->buf, slot->batch, + &collision); + + next_slot = slot->next; + + /* Unless this is the very first slot in the bucket, we need to + * free the slot. */ + if (slot != bucket) { + raft_free(slot); + } + + if (rv != 0) { + return rv; + } + + /* The given hash table is assumed to be large enough to hold + * all ref counts without any conflict. */ + assert(!collision); + }; + + return 0; +} + +/* Grow the size of the reference count hash table. */ +static int refsGrow(struct raft_log *l) +{ + struct raft_entry_ref *table; /* New hash table. */ + size_t size; /* Size of the new hash table. 
*/ + size_t i; + + assert(l != NULL); + assert(l->refs_size > 0); + + size = l->refs_size * 2; /* Double the table size */ + + table = raft_calloc(size, sizeof *table); + if (table == NULL) { + return RAFT_NOMEM; + } + + /* Populate the new hash table, inserting all entries existing in the + * current hash table. Each bucket will have a different key in the new + * hash table, since the size has changed. */ + for (i = 0; i < l->refs_size; i++) { + struct raft_entry_ref *bucket = &l->refs[i]; + if (bucket->count > 0) { + int rv = refsMove(bucket, table, size); + if (rv != 0) { + return rv; + } + } else { + /* If the count is zero, we expect that the bucket is + * unused. */ + assert(bucket->next == NULL); + } + } + + raft_free(l->refs); + + l->refs = table; + l->refs_size = size; + + return 0; +} + +/* Initialize the reference count of the entry with the given index, setting it + * to 1. */ +static int refsInit(struct raft_log *l, + const raft_term term, + const raft_index index, + struct raft_buffer buf, + void *batch) +{ + int i; + + assert(l != NULL); + assert(term > 0); + assert(index > 0); + + /* Initialize the hash map with a reasonable size */ + if (l->refs == NULL) { + l->refs_size = LOG__REFS_INITIAL_SIZE; + l->refs = raft_calloc(l->refs_size, sizeof *l->refs); + if (l->refs == NULL) { + return RAFT_NOMEM; + } + } + + /* Check if the bucket associated with the given index is available + * (i.e. there are no collisions), or grow the table and re-key it + * otherwise. + * + * We limit the number of times we try to grow the table to 10, to avoid + * eating up too much memory. In practice, there should never be a case + * where this is not enough. */ + for (i = 0; i < 10; i++) { + bool collision; + int rc; + + rc = refsTryInsert(l->refs, l->refs_size, term, index, 1, buf, + batch, &collision); + if (rc != 0) { + return RAFT_NOMEM; + } + + if (!collision) { + return 0; + } + + rc = refsGrow(l); + if (rc != 0) { + return rc; + } + }; + + return RAFT_NOMEM; +} + +/* Increment the refcount of the entry with the given term and index. */ +static void refsIncr(struct raft_log *l, + const raft_term term, + const raft_index index) +{ + size_t key; /* Hash table key for the given index. */ + struct raft_entry_ref *slot; /* Slot for the given term/index */ + + assert(l != NULL); + assert(term > 0); + assert(index > 0); + + key = refsKey(index, l->refs_size); + + /* Lookup the slot associated with the given term/index, which must have + * been previously inserted. */ + slot = &l->refs[key]; + while (1) { + assert(slot != NULL); + assert(slot->index == index); + if (slot->term == term) { + break; + } + slot = slot->next; + } + assert(slot != NULL); + + slot->count++; +} + +/* Decrement the refcount of the entry with the given index. Return a boolean + * indicating whether the entry has now zero references. */ +static bool refsDecr(struct raft_log *l, + const raft_term term, + const raft_index index) +{ + size_t key; /* Hash table key for the given index. */ + struct raft_entry_ref *slot; /* Slot for the given term/index */ + struct raft_entry_ref + *prev_slot; /* Slot preceeding the one to decrement */ + + assert(l != NULL); + assert(term > 0); + assert(index > 0); + + key = refsKey(index, l->refs_size); + prev_slot = NULL; + + /* Lookup the slot associated with the given term/index, keeping track + * of its previous slot in the bucket list. 
*/ + slot = &l->refs[key]; + while (1) { + assert(slot != NULL); + assert(slot->index == index); + if (slot->term == term) { + break; + } + prev_slot = slot; + slot = slot->next; + } + + slot->count--; + + if (slot->count > 0) { + /* The entry is still referenced. */ + return false; + } + + /* If the refcount has dropped to zero, delete the slot. */ + if (prev_slot != NULL) { + /* This isn't the very first slot, simply unlink it from the + * slot list. */ + prev_slot->next = slot->next; + raft_free(slot); + } else if (slot->next != NULL) { + /* This is the very first slot, and slot list is not empty. Copy + * the second slot into the first one, then delete it. */ + struct raft_entry_ref *second_slot = slot->next; + *slot = *second_slot; + raft_free(second_slot); + } + + return true; +} + +struct raft_log *logInit(void) +{ + struct raft_log *log; + + log = raft_malloc(sizeof(*log)); + if (log == NULL) { + return NULL; + } + + log->entries = NULL; + log->size = 0; + log->front = log->back = 0; + log->offset = 0; + log->refs = NULL; + log->refs_size = 0; + log->snapshot.last_index = 0; + log->snapshot.last_term = 0; + + return log; +} + +/* Return the index of the i'th entry in the log. */ +static raft_index indexAt(struct raft_log *l, size_t i) +{ + return l->offset + i + 1; +} + +/* Return the circular buffer position of the i'th entry in the log. */ +static size_t positionAt(struct raft_log *l, size_t i) +{ + return (l->front + i) % l->size; +} + +/* Return the i'th entry in the log. */ +static struct raft_entry *entryAt(struct raft_log *l, size_t i) +{ + return &l->entries[positionAt(l, i)]; +} + +void logClose(struct raft_log *l) +{ + void *batch = NULL; /* Last batch that has been freed */ + + assert(l != NULL); + + if (l->entries != NULL) { + size_t i; + size_t n = logNumEntries(l); + + for (i = 0; i < n; i++) { + struct raft_entry *entry = entryAt(l, i); + raft_index index = indexAt(l, i); + size_t key = refsKey(index, l->refs_size); + struct raft_entry_ref *slot = &l->refs[key]; + + /* We require that there are no outstanding references + * to active entries. */ + assert(slot->count == 1); + + /* TODO: we should support the case where the bucket has + * more than one slot. */ + assert(slot->next == NULL); + + /* Release the memory used by the entry data (either + * directly or via a batch). */ + if (entry->batch == NULL) { + if (entry->buf.base != NULL) { + raft_free(entry->buf.base); + } + } else { + if (entry->batch != batch) { + /* This batch was not released yet, so + * let's do it now. */ + batch = entry->batch; + raft_free(entry->batch); + } + } + } + + raft_free(l->entries); + } + + if (l->refs != NULL) { + raft_free(l->refs); + } + + raft_free(l); +} + +void logStart(struct raft_log *l, + raft_index snapshot_index, + raft_term snapshot_term, + raft_index start_index) +{ + assert(logNumEntries(l) == 0); + assert(start_index > 0); + assert(start_index <= snapshot_index + 1); + assert(snapshot_index == 0 || snapshot_term != 0); + l->snapshot.last_index = snapshot_index; + l->snapshot.last_term = snapshot_term; + l->offset = start_index - 1; +} + +/* Ensure that the entries array has enough free slots for adding a new entry. + */ +static int ensureCapacity(struct raft_log *l) +{ + struct raft_entry *entries; /* New entries array */ + size_t n; /* Current number of entries */ + size_t size; /* Size of the new array */ + size_t i; + + n = logNumEntries(l); + + if (n + 1 < l->size) { + return 0; + } + + /* Make the new size twice the current size plus one (for the new + * entry). 
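+	 * For example, an entries array of 3 slots is reallocated to
+	 * (3 + 1) * 2 = 8 slots.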
Over-allocating now avoids smaller allocations later. */ + size = (l->size + 1) * 2; + + entries = raft_calloc(size, sizeof *entries); + if (entries == NULL) { + return RAFT_NOMEM; + } + + /* Copy all active old entries to the beginning of the newly allocated + * array. */ + for (i = 0; i < n; i++) { + memcpy(&entries[i], entryAt(l, i), sizeof *entries); + } + + /* Release the old entries array. */ + if (l->entries != NULL) { + raft_free(l->entries); + } + + l->entries = entries; + l->size = size; + l->front = 0; + l->back = n; + + return 0; +} + +int logReinstate(struct raft_log *l, + raft_term term, + unsigned short type, + bool *reinstated) +{ + raft_index index; + size_t key; + struct raft_entry_ref *bucket; + struct raft_entry_ref *slot; + struct raft_entry *entry; + int rv; + + *reinstated = false; + + if (l->refs_size == 0) { + return 0; + } + + index = logLastIndex(l) + 1; + key = refsKey(index, l->refs_size); + bucket = &l->refs[key]; + if (bucket->count == 0 || bucket->index != index) { + return 0; + } + + for (slot = bucket; slot != NULL; slot = slot->next) { + if (slot->term == term) { + rv = ensureCapacity(l); + if (rv != 0) { + return rv; + } + slot->count++; + l->back++; + l->back %= l->size; + entry = &l->entries[l->back]; + entry->term = term; + entry->type = type; + entry->buf = slot->buf; + entry->batch = slot->batch; + *reinstated = true; + break; + } + } + + return 0; +} + +int logAppend(struct raft_log *l, + const raft_term term, + const unsigned short type, + const struct raft_buffer *buf, + void *batch) +{ + int rv; + struct raft_entry *entry; + raft_index index; + + assert(l != NULL); + assert(term > 0); + assert(type == RAFT_CHANGE || type == RAFT_BARRIER || + type == RAFT_COMMAND); + assert(buf != NULL); + + rv = ensureCapacity(l); + if (rv != 0) { + return rv; + } + + index = logLastIndex(l) + 1; + + rv = refsInit(l, term, index, *buf, batch); + if (rv != 0) { + return rv; + } + + entry = &l->entries[l->back]; + entry->term = term; + entry->type = type; + entry->buf = *buf; + entry->batch = batch; + + l->back += 1; + l->back = l->back % l->size; + + return 0; +} + +int logAppendCommands(struct raft_log *l, + const raft_term term, + const struct raft_buffer bufs[], + const unsigned n) +{ + unsigned i; + int rv; + + assert(l != NULL); + assert(term > 0); + assert(bufs != NULL); + assert(n > 0); + + for (i = 0; i < n; i++) { + const struct raft_buffer *buf = &bufs[i]; + rv = logAppend(l, term, RAFT_COMMAND, buf, NULL); + if (rv != 0) { + return rv; + } + } + + return 0; +} + +int logAppendConfiguration(struct raft_log *l, + const raft_term term, + const struct raft_configuration *configuration) +{ + struct raft_buffer buf; + int rv; + + assert(l != NULL); + assert(term > 0); + assert(configuration != NULL); + + /* Encode the configuration into a buffer. */ + rv = configurationEncode(configuration, &buf); + if (rv != 0) { + goto err; + } + + /* Append the new entry to the log. */ + rv = logAppend(l, term, RAFT_CHANGE, &buf, NULL); + if (rv != 0) { + goto err_after_encode; + } + + return 0; + +err_after_encode: + raft_free(buf.base); + +err: + assert(rv != 0); + return rv; +} + +size_t logNumEntries(struct raft_log *l) +{ + assert(l != NULL); + + /* The circular buffer is not wrapped. */ + if (l->front <= l->back) { + return l->back - l->front; + } + + /* The circular buffer is wrapped. 
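+	 * For example, with size=8, front=6 and back=2 the used slots are
+	 * 6, 7, 0 and 1, i.e. 8 - 6 + 2 = 4 entries.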
 */
+	return l->size - l->front + l->back;
+}
+
+raft_index logLastIndex(struct raft_log *l)
+{
+	/* If there are no entries in the log, but there is a snapshot
+	 * available, check that its last index is consistent with the
+	 * offset. */
+	if (logNumEntries(l) == 0 && l->snapshot.last_index != 0) {
+		assert(l->offset <= l->snapshot.last_index);
+	}
+	return l->offset + logNumEntries(l);
+}
+
+/* Return the position of the entry with the given index in the entries array.
+ *
+ * If no entry with the given index is in the log, return the size of the
+ * entries array. */
+static size_t locateEntry(struct raft_log *l, const raft_index index)
+{
+	size_t n = logNumEntries(l);
+
+	if (n == 0 || index < indexAt(l, 0) || index > indexAt(l, n - 1)) {
+		return l->size;
+	}
+
+	/* Get the circular buffer position of the desired entry. Log indexes
+	 * start at 1, so we subtract one to get array indexes. We also need to
+	 * subtract any index offset this log might start at. */
+	return positionAt(l, (size_t)((index - 1) - l->offset));
+}
+
+raft_term logTermOf(struct raft_log *l, const raft_index index)
+{
+	size_t i;
+	assert(index > 0);
+	assert(l->offset <= l->snapshot.last_index);
+
+	if ((index < l->offset + 1 && index != l->snapshot.last_index) ||
+	    index > logLastIndex(l)) {
+		return 0;
+	}
+
+	if (index == l->snapshot.last_index) {
+		assert(l->snapshot.last_term != 0);
+		/* Coherence check that if we still have the entry at
+		 * last_index, its term matches the one in the snapshot. */
+		i = locateEntry(l, index);
+		if (i != l->size) {
+			assert(l->entries[i].term == l->snapshot.last_term);
+		}
+		return l->snapshot.last_term;
+	}
+
+	i = locateEntry(l, index);
+	assert(i < l->size);
+	return l->entries[i].term;
+}
+
+raft_index logSnapshotIndex(struct raft_log *l)
+{
+	return l->snapshot.last_index;
+}
+
+raft_term logLastTerm(struct raft_log *l)
+{
+	raft_index last_index;
+	last_index = logLastIndex(l);
+	return last_index > 0 ? logTermOf(l, last_index) : 0;
+}
+
+const struct raft_entry *logGet(struct raft_log *l, const raft_index index)
+{
+	size_t i;
+
+	assert(l != NULL);
+
+	/* Get the array index of the desired entry. */
+	i = locateEntry(l, index);
+	if (i == l->size) {
+		return NULL;
+	}
+
+	assert(i < l->size);
+
+	return &l->entries[i];
+}
+
+int logAcquire(struct raft_log *l,
+	       const raft_index index,
+	       struct raft_entry *entries[],
+	       unsigned *n)
+{
+	size_t i;
+	size_t j;
+
+	assert(l != NULL);
+	assert(index > 0);
+	assert(entries != NULL);
+	assert(n != NULL);
+
+	/* Get the array index of the first entry to acquire. */
+	i = locateEntry(l, index);
+
+	if (i == l->size) {
+		*n = 0;
+		*entries = NULL;
+		return 0;
+	}
+
+	if (i < l->back) {
+		/* The last entry does not wrap with respect to i, so the
+		 * number of entries is simply the length of the range
+		 * [i...l->back). */
+		*n = (unsigned)(l->back - i);
+	} else {
+		/* The last entry wraps with respect to i, so the number of
+		 * entries is the sum of the lengths of the ranges
+		 * [i...l->size) and [0...l->back), which is
+		 * l->size - i + l->back. */
+		*n = (unsigned)(l->size - i + l->back);
+	}
+
+	assert(*n > 0);
+
+	*entries = raft_calloc(*n, sizeof **entries);
+	if (*entries == NULL) {
+		return RAFT_NOMEM;
+	}
+
+	for (j = 0; j < *n; j++) {
+		size_t k = (i + j) % l->size;
+		struct raft_entry *entry = &(*entries)[j];
+		*entry = l->entries[k];
+		refsIncr(l, entry->term, index + j);
+	}
+
+	return 0;
+}
+
+/* Return true if the given batch is referenced by any entry currently in the
+ * log.
*/ +static bool isBatchReferenced(struct raft_log *l, const void *batch) +{ + size_t i; + + /* Iterate through all live entries to see if there's one + * belonging to the same batch. This is slightly inefficient but + * this code path should be taken very rarely in practice. */ + for (i = 0; i < logNumEntries(l); i++) { + struct raft_entry *entry = entryAt(l, i); + if (entry->batch == batch) { + return true; + } + } + + return false; +} + +void logRelease(struct raft_log *l, + const raft_index index, + struct raft_entry entries[], + const unsigned n) +{ + size_t i; + void *batch = NULL; /* Last batch whose memory was freed */ + + assert(l != NULL); + assert((entries == NULL && n == 0) || (entries != NULL && n > 0)); + + for (i = 0; i < n; i++) { + struct raft_entry *entry = &entries[i]; + bool unref; + + unref = refsDecr(l, entry->term, index + i); + + /* If there are no outstanding references to this entry, free + * its payload if it's not part of a batch, or check if we can + * free the batch itself. */ + if (unref) { + if (entries[i].batch == NULL) { + if (entry->buf.base != NULL) { + raft_free(entries[i].buf.base); + } + } else { + if (entry->batch != batch) { + if (!isBatchReferenced(l, + entry->batch)) { + batch = entry->batch; + raft_free(batch); + } + } + } + } + } + + if (entries != NULL) { + raft_free(entries); + } +} + +/* Clear the log if it became empty. */ +static void clearIfEmpty(struct raft_log *l) +{ + if (logNumEntries(l) > 0) { + return; + } + raft_free(l->entries); + l->entries = NULL; + l->size = 0; + l->front = 0; + l->back = 0; +} + +/* Destroy an entry, possibly releasing the memory of its buffer. */ +static void destroyEntry(struct raft_log *l, struct raft_entry *entry) +{ + if (entry->batch == NULL) { + if (entry->buf.base != NULL) { + raft_free(entry->buf.base); + } + } else { + if (!isBatchReferenced(l, entry->batch)) { + raft_free(entry->batch); + } + } +} + +/* Core logic of @logTruncate and @logDiscard, removing all log entries from + * @index onward. If @destroy is true, also destroy the removed entries. */ +static void removeSuffix(struct raft_log *l, + const raft_index index, + bool destroy) +{ + size_t i; + size_t n; + raft_index start = index; + + assert(l != NULL); + assert(index > l->offset); + assert(index <= logLastIndex(l)); + + /* Number of entries to delete */ + n = (size_t)(logLastIndex(l) - start) + 1; + + for (i = 0; i < n; i++) { + struct raft_entry *entry; + bool unref; + + if (l->back == 0) { + l->back = l->size - 1; + } else { + l->back--; + } + + entry = &l->entries[l->back]; + unref = refsDecr(l, entry->term, start + n - i - 1); + + if (unref && destroy) { + destroyEntry(l, entry); + } + } + + clearIfEmpty(l); +} + +void logTruncate(struct raft_log *l, const raft_index index) +{ + if (logNumEntries(l) == 0) { + return; + } + removeSuffix(l, index, true); +} + +void logDiscard(struct raft_log *l, const raft_index index) +{ + removeSuffix(l, index, false); +} + +/* Delete all entries up to the given index (included). 
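+ * For example, if the log holds indices 5..10, calling removePrefix() with
+ * index 7 deletes entries 5, 6 and 7, leaves 8..10 and advances l->offset
+ * by 3.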
*/ +static void removePrefix(struct raft_log *l, const raft_index index) +{ + size_t i; + size_t n; + + assert(l != NULL); + assert(index > 0); + assert(index <= logLastIndex(l)); + + /* Number of entries to delete */ + n = (size_t)(index - indexAt(l, 0)) + 1; + + for (i = 0; i < n; i++) { + struct raft_entry *entry; + bool unref; + + entry = &l->entries[l->front]; + + if (l->front == l->size - 1) { + l->front = 0; + } else { + l->front++; + } + l->offset++; + + unref = refsDecr(l, entry->term, l->offset); + + if (unref) { + destroyEntry(l, entry); + } + } + + clearIfEmpty(l); +} + +void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing) +{ + raft_term last_term = logTermOf(l, last_index); + + /* We must have an entry at this index */ + assert(last_term != 0); + + l->snapshot.last_index = last_index; + l->snapshot.last_term = last_term; + + /* If we have not at least n entries preceeding the given last index, + * then there's nothing to remove and we're done. */ + if (last_index <= trailing || + locateEntry(l, last_index - trailing) == l->size) { + return; + } + + removePrefix(l, last_index - trailing); +} + +void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term) +{ + size_t n = logNumEntries(l); + assert(last_index > 0); + assert(last_term > 0); + if (n > 0) { + logTruncate(l, logLastIndex(l) - n + 1); + } + l->snapshot.last_index = last_index; + l->snapshot.last_term = last_term; + l->offset = last_index; +} diff --git a/src/raft/log.h b/src/raft/log.h new file mode 100644 index 000000000..46707d2f5 --- /dev/null +++ b/src/raft/log.h @@ -0,0 +1,169 @@ +/* In-memory cache of the persistent raft log stored on disk. */ + +#ifndef RAFT_LOG_H_ +#define RAFT_LOG_H_ + +#include "../raft.h" + +/* Initial size of the entry reference count hash table. */ +#define LOG__REFS_INITIAL_SIZE 256 + +/** + * Counter for outstanding references to a log entry. + * + * When an entry is first appended to the log, its refcount is set to one (the + * log itself is the only one referencing the entry). Whenever an entry is + * included in an I/O request (to write it to disk or to send it to other + * servers) its refcount is increased by one. Whenever an entry gets deleted + * from the log its refcount is decreased by one. Likewise, whenever an I/O + * request is completed the refcount of the relevant entries is decreased by + * one. When the refcount drops to zero the memory that its @buf attribute + * points to gets released, or, if the @batch attribute is non-NULL, a check is + * made to see if all other entries of the same batch also have a zero refcount, + * and the memory that @batch points to gets released if that's the case. + */ +struct raft_entry_ref +{ + raft_term term; /* Term of the entry being ref-counted. */ + raft_index index; /* Index of the entry being ref-counted. */ + unsigned short count; /* Number of references. */ + /* The next two fields are copied from the corresponding fields of the + * raft_entry pointed to by this reference. We store them here as well, + * so that logReinstate can retrieve them when it finds a raft_entry_ref + * with the same index and term as it was passed, and create a full + * raft_entry using them. */ + struct raft_buffer buf; + void *batch; + struct raft_entry_ref + *next; /* Next item in the bucket (for collisions). */ +}; + +/** + * In-memory cache of the persistent raft log stored on disk. 
+ *
+ * The raft log cache is implemented as a circular buffer of log entries, which
+ * makes some frequent operations very efficient (e.g. deleting the first N
+ * entries when snapshotting).
+ */
+struct raft_log
+{
+	struct raft_entry *entries; /* Circular buffer of log entries. */
+	size_t size;        /* Number of available slots in the buffer. */
+	size_t front, back; /* Indexes of used slots [front, back). */
+	raft_index offset;  /* Index of first entry is offset+1. */
+	struct raft_entry_ref
+	    *refs;        /* Log entries reference counts hash table. */
+	size_t refs_size; /* Size of the reference counts hash table. */
+	struct /* Information about last snapshot, or zero. */
+	{
+		raft_index
+		    last_index; /* Snapshot replaces all entries up to here. */
+		raft_term last_term; /* Term of last index. */
+	} snapshot;
+};
+
+/* Initialize an empty in-memory log of raft entries. */
+struct raft_log *logInit(void);
+
+/* Release all memory used by the given log object. */
+void logClose(struct raft_log *l);
+
+/* Called at startup when populating the log with entries loaded from disk. It
+ * sets the starting state of the log. The start index must be lower than or
+ * equal to snapshot_index + 1. */
+void logStart(struct raft_log *l,
+	      raft_index snapshot_index,
+	      raft_term snapshot_term,
+	      raft_index start_index);
+
+/* Get the number of entries the log currently contains. */
+size_t logNumEntries(struct raft_log *l);
+
+/* Get the index of the last entry in the log. Return #0 if the log is empty. */
+raft_index logLastIndex(struct raft_log *l);
+
+/* Get the term of the last entry in the log. Return #0 if the log is empty. */
+raft_term logLastTerm(struct raft_log *l);
+
+/* Get the term of the entry with the given index. Return #0 if @index is
+ * greater than the last index of the log, or if it's lower than the oldest
+ * index we know the term of (either because it's outstanding or because it's
+ * the last entry in the most recent snapshot). */
+raft_term logTermOf(struct raft_log *l, raft_index index);
+
+/* Get the index of the last entry in the most recent snapshot. Return #0 if
+ * there are no snapshots. */
+raft_index logSnapshotIndex(struct raft_log *l);
+
+/* Get the entry with the given index. The returned pointer remains valid only
+ * as long as no API that might delete the entry with the given index is
+ * invoked. Return #NULL if there is no such entry. */
+const struct raft_entry *logGet(struct raft_log *l, const raft_index index);
+
+/* Check whether the hash map is already tracking an entry with the given
+ * @term and @index (that is not part of the "logical" log). If so, increment
+ * the refcount of that entry and set @reinstated to true; otherwise, set
+ * @reinstated to false. */
+int logReinstate(struct raft_log *l,
+		 raft_term term,
+		 unsigned short type,
+		 bool *reinstated);
+
+/* Append a new entry to the log. */
+int logAppend(struct raft_log *l,
+	      raft_term term,
+	      unsigned short type,
+	      const struct raft_buffer *buf,
+	      void *batch);
+
+/* Convenience to append a series of #RAFT_COMMAND entries. */
+int logAppendCommands(struct raft_log *l,
+		      const raft_term term,
+		      const struct raft_buffer bufs[],
+		      const unsigned n);
+
+/* Convenience to encode and append a single #RAFT_CHANGE entry. */
+int logAppendConfiguration(struct raft_log *l,
+			   const raft_term term,
+			   const struct raft_configuration *configuration);
+
+/* Acquire an array of entries from the given index onwards.
The payload + * memory referenced by the @buf attribute of the returned entries is guaranteed + * to be valid until logRelease() is called. */ +int logAcquire(struct raft_log *l, + raft_index index, + struct raft_entry *entries[], + unsigned *n); + +/* Release a previously acquired array of entries. */ +void logRelease(struct raft_log *l, + raft_index index, + struct raft_entry entries[], + unsigned n); + +/* Delete all entries from the given index (included) onwards. If the log is + * empty this is a no-op. If @index is lower than or equal to the index of the + * first entry in the log, then the log will become empty. */ +void logTruncate(struct raft_log *l, const raft_index index); + +/* Discard all entries from the given index (included) onwards. This is exactly + * the same as truncate, but the memory of the entries does not gets + * released. This is called as part of error handling, when reverting the effect + * of previous logAppend calls. */ +void logDiscard(struct raft_log *l, const raft_index index); + +/* To be called when taking a new snapshot. The log must contain an entry at + * last_index, which is the index of the last entry included in the + * snapshot. The function will update the last snapshot information and delete + * all entries up to last_index - trailing (included). If the log contains no + * entry at last_index - trailing, then no entry will be deleted. */ +void logSnapshot(struct raft_log *l, raft_index last_index, unsigned trailing); + +/* To be called when installing a snapshot. + * + * The log can be in any state. All outstanding entries will be discarded, the + * last index and last term of the most recent snapshot will be set to the given + * values, and the offset adjusted accordingly. */ +void logRestore(struct raft_log *l, raft_index last_index, raft_term last_term); + +#endif /* RAFT_LOG_H_ */ diff --git a/src/raft/membership.c b/src/raft/membership.c new file mode 100644 index 000000000..f810c7722 --- /dev/null +++ b/src/raft/membership.c @@ -0,0 +1,279 @@ +#include "membership.h" + +#include "../raft.h" +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "err.h" +#include "heap.h" +#include "log.h" +#include "progress.h" + +int membershipCanChangeConfiguration(struct raft *r) +{ + int rv; + + if (r->state != RAFT_LEADER || r->transfer != NULL) { + tracef("NOT LEADER"); + rv = RAFT_NOTLEADER; + goto err; + } + + if (r->configuration_uncommitted_index != 0) { + tracef("r->configuration_uncommitted_index %llu", + r->configuration_uncommitted_index); + rv = RAFT_CANTCHANGE; + goto err; + } + + if (r->leader_state.promotee_id != 0) { + tracef("r->leader_state.promotee_id %llu", + r->leader_state.promotee_id); + rv = RAFT_CANTCHANGE; + goto err; + } + + /* In order to become leader at all we are supposed to have committed at + * least the initial configuration at index 1. */ + assert(r->configuration_committed_index > 0); + + /* The index of the last committed configuration can't be greater than + * the last log index. */ + assert(logLastIndex(r->log) >= r->configuration_committed_index); + + /* No catch-up round should be in progress. 
*/ + assert(r->leader_state.round_number == 0); + assert(r->leader_state.round_index == 0); + assert(r->leader_state.round_start == 0); + + return 0; + +err: + assert(rv != 0); + ErrMsgFromCode(r->errmsg, rv); + return rv; +} + +int membershipFetchLastCommittedConfiguration(struct raft *r, + struct raft_configuration *conf) +{ + const struct raft_entry *entry; + int rv; + + /* Try to get the entry at r->configuration_committed_index from the + * log. If the entry is not present in the log anymore because the log + * was truncated after a snapshot, we can just use + * configuration_last_snapshot, which we cached when we took or restored + * the snapshot and is guaranteed to match the content that the entry at + * r->configuration_committed_index had. */ + entry = logGet(r->log, r->configuration_committed_index); + if (entry != NULL) { + rv = configurationDecode(&entry->buf, conf); + } else { + assert(r->configuration_last_snapshot.n > 0); + rv = configurationCopy(&r->configuration_last_snapshot, conf); + } + if (rv != 0) { + return rv; + } + + return 0; +} + +bool membershipUpdateCatchUpRound(struct raft *r) +{ + unsigned server_index; + raft_index match_index; + raft_index last_index; + raft_time now = r->io->time(r->io); + raft_time round_duration; + bool is_up_to_date; + bool is_fast_enough; + + assert(r->state == RAFT_LEADER); + assert(r->leader_state.promotee_id != 0); + + server_index = configurationIndexOf(&r->configuration, + r->leader_state.promotee_id); + assert(server_index < r->configuration.n); + + match_index = progressMatchIndex(r, server_index); + + /* If the server did not reach the target index for this round, it did + * not catch up. */ + if (match_index < r->leader_state.round_index) { + tracef( + "member (index: %u) not yet caught up match_index:%llu " + "round_index:%llu", + server_index, match_index, r->leader_state.round_index); + return false; + } + + last_index = logLastIndex(r->log); + round_duration = now - r->leader_state.round_start; + + is_up_to_date = match_index == last_index; + is_fast_enough = round_duration < r->election_timeout; + + tracef("member is_up_to_date:%d is_fast_enough:%d", is_up_to_date, + is_fast_enough); + + /* If the server's log is fully up-to-date or the round that just + * terminated was fast enough, then the server as caught up. */ + if (is_up_to_date || is_fast_enough) { + r->leader_state.round_number = 0; + r->leader_state.round_index = 0; + r->leader_state.round_start = 0; + + return true; + } + + /* If we get here it means that this catch-up round is complete, but + * there are more entries to replicate, or it was not fast enough. Let's + * start a new round. 
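+ * For example, with the default election_timeout of 1000 msecs, a round
+ * that completed in 800 msecs counts as fast enough and the promotion
+ * succeeds even if new entries arrived meanwhile; a 1500 msecs round with
+ * outstanding entries starts another round targeting the current last
+ * index.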
*/ + r->leader_state.round_number++; + r->leader_state.round_index = last_index; + r->leader_state.round_start = now; + + return false; +} + +int membershipUncommittedChange(struct raft *r, + const raft_index index, + const struct raft_entry *entry) +{ + struct raft_configuration configuration; + int rv; + char msg[128]; + + assert(r != NULL); + assert(r->state == RAFT_FOLLOWER); + assert(entry != NULL); + assert(entry->type == RAFT_CHANGE); + + rv = configurationDecode(&entry->buf, &configuration); + if (rv != 0) { + tracef("failed to decode configuration at index:%llu", index); + goto err; + } + + /* ignore errors */ + snprintf(msg, sizeof(msg), "uncommitted config change at index:%llu", + index); + configurationTrace(r, &configuration, msg); + + raft_configuration_close(&r->configuration); + + r->configuration = configuration; + r->configuration_uncommitted_index = index; + + return 0; + +err: + assert(rv != 0); + return rv; +} + +int membershipRollback(struct raft *r) +{ + int rv; + + assert(r != NULL); + assert(r->state == RAFT_FOLLOWER); + assert(r->configuration_uncommitted_index > 0); + tracef("roll back membership"); + + /* Fetch the last committed configuration entry. */ + assert(r->configuration_committed_index != 0); + + /* Replace the current configuration with the last committed one. */ + configurationClose(&r->configuration); + rv = membershipFetchLastCommittedConfiguration(r, &r->configuration); + if (rv != 0) { + return rv; + } + + configurationTrace(r, &r->configuration, "roll back config"); + r->configuration_uncommitted_index = 0; + return 0; +} + +void membershipLeadershipTransferInit(struct raft *r, + struct raft_transfer *req, + raft_id id, + raft_transfer_cb cb) +{ + req->cb = cb; + req->id = id; + req->start = r->io->time(r->io); + req->send.data = NULL; + r->transfer = req; +} + +static void membershipLeadershipSendCb(struct raft_io_send *send, int status) +{ + (void)status; + RaftHeapFree(send); +} + +int membershipLeadershipTransferStart(struct raft *r) +{ + const struct raft_server *server; + struct raft_message message; + struct raft_io_send *send; + int rv; + assert(r->transfer->send.data == NULL); + server = configurationGet(&r->configuration, r->transfer->id); + assert(server != NULL); + if (server == NULL) { + tracef("transferee server not found in configuration"); + return -1; + } + + /* Don't use the raft_io_send object embedded in struct raft_transfer, + * since the two objects must have different lifetimes. For example + * raft_io_send might live longer than raft_transfer, see #396. + * + * Ideally we should remove the embedded struct raft_io_send send field + * from struct raft_transfer, and replace it with a raft_io_send *send + * pointer, that we set to the raft_io_send object allocated in this + * function. This would break ABI compatibility though. */ + send = RaftHeapMalloc(sizeof *send); + if (send == NULL) { + return RAFT_NOMEM; + } + + message.type = RAFT_IO_TIMEOUT_NOW; + message.server_id = server->id; + message.server_address = server->address; + message.timeout_now.term = r->current_term; + message.timeout_now.last_log_index = logLastIndex(r->log); + message.timeout_now.last_log_term = logLastTerm(r->log); + + /* Set the data attribute of the raft_io_send object embedded in + * raft_transfer. This is needed because we historically used it as a + * flag to indicate that a transfer request was sent. See the + * replicationUpdate function. 
*/ + r->transfer->send.data = r; + + send->data = r; + + rv = r->io->send(r->io, send, &message, membershipLeadershipSendCb); + if (rv != 0) { + RaftHeapFree(send); + ErrMsgTransferf(r->io->errmsg, r->errmsg, + "send timeout now to %llu", server->id); + return rv; + } + return 0; +} + +void membershipLeadershipTransferClose(struct raft *r) +{ + struct raft_transfer *req = r->transfer; + raft_transfer_cb cb = req->cb; + r->transfer = NULL; + if (cb != NULL) { + cb(req); + } +} diff --git a/src/raft/membership.h b/src/raft/membership.h new file mode 100644 index 000000000..15769c9cd --- /dev/null +++ b/src/raft/membership.h @@ -0,0 +1,59 @@ +/* Membership-related APIs. */ + +#ifndef MEMBERSHIP_H_ +#define MEMBERSHIP_H_ + +#include "../raft.h" + +/* Helper returning an error if the configuration can't be changed, either + * because this node is not the leader or because a configuration change is + * already in progress. */ +int membershipCanChangeConfiguration(struct raft *r); + +/* Populate the given configuration object with the most recent committed + * configuration, the one contained in the entry at + * r->configuration_committed_index. */ +int membershipFetchLastCommittedConfiguration(struct raft *r, + struct raft_configuration *conf); + +/* Update the information about the progress that the non-voting server + * currently being promoted is making in catching with logs. + * + * Return false if the server being promoted did not yet catch-up with logs, and + * true if it did. + * + * This function must be called only by leaders after a @raft_assign request + * has been submitted. */ +bool membershipUpdateCatchUpRound(struct raft *r); + +/* Update the local configuration replacing it with the content of the given + * RAFT_CHANGE entry, which has just been received in as part of an + * AppendEntries RPC request. The uncommitted configuration index will be + * updated accordingly. + * + * It must be called only by followers. */ +int membershipUncommittedChange(struct raft *r, + const raft_index index, + const struct raft_entry *entry); + +/* Rollback any promotion configuration change that was applied locally, but + * failed to be committed. It must be called by followers after they receive an + * AppendEntries RPC request that instructs them to evict the uncommitted entry + * from their log. */ +int membershipRollback(struct raft *r); + +/* Initialize the state of a leadership transfer request. */ +void membershipLeadershipTransferInit(struct raft *r, + struct raft_transfer *req, + raft_id id, + raft_transfer_cb cb); + +/* Start the leadership transfer by sending a TimeoutNow message to the target + * server. */ +int membershipLeadershipTransferStart(struct raft *r); + +/* Finish a leadership transfer (whether successful or not), resetting the + * leadership transfer state and firing the user callback. */ +void membershipLeadershipTransferClose(struct raft *r); + +#endif /* MEMBERSHIP_H_ */ diff --git a/src/raft/progress.c b/src/raft/progress.c new file mode 100644 index 000000000..696134c70 --- /dev/null +++ b/src/raft/progress.c @@ -0,0 +1,325 @@ +#include "progress.h" + +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "log.h" + +#ifndef max +#define max(a, b) ((a) < (b) ? (b) : (a)) +#endif + +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + +/* Initialize a single progress object. 
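+ * A fresh peer is assumed to have replicated nothing (match_index = 0) and
+ * to need entries starting right after the leader's last log index
+ * (next_index = last_index + 1); it starts out in PROBE state until
+ * AppendEntries results tell us more.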
*/ +static void initProgress(struct raft_progress *p, raft_index last_index) +{ + p->next_index = last_index + 1; + p->match_index = 0; + p->snapshot_index = 0; + p->last_send = 0; + p->snapshot_last_send = 0; + p->recent_recv = false; + p->state = PROGRESS__PROBE; + p->features = 0; +} + +int progressBuildArray(struct raft *r) +{ + struct raft_progress *progress; + unsigned i; + raft_index last_index = logLastIndex(r->log); + progress = raft_malloc(r->configuration.n * sizeof *progress); + if (progress == NULL) { + return RAFT_NOMEM; + } + for (i = 0; i < r->configuration.n; i++) { + initProgress(&progress[i], last_index); + if (r->configuration.servers[i].id == r->id) { + progress[i].match_index = r->last_stored; + } + } + r->leader_state.progress = progress; + return 0; +} + +int progressRebuildArray(struct raft *r, + const struct raft_configuration *configuration) +{ + raft_index last_index = logLastIndex(r->log); + struct raft_progress *progress; + unsigned i; + unsigned j; + raft_id id; + + progress = raft_malloc(configuration->n * sizeof *progress); + if (progress == NULL) { + return RAFT_NOMEM; + } + + /* First copy the progress information for the servers that exists both + * in the current and in the new configuration. */ + for (i = 0; i < r->configuration.n; i++) { + id = r->configuration.servers[i].id; + j = configurationIndexOf(configuration, id); + if (j == configuration->n) { + /* This server is not present in the new configuration, + * so we just skip it. */ + continue; + } + progress[j] = r->leader_state.progress[i]; + } + + /* Then reset the replication state for servers that are present in the + * new configuration, but not in the current one. */ + for (i = 0; i < configuration->n; i++) { + id = configuration->servers[i].id; + j = configurationIndexOf(&r->configuration, id); + if (j < r->configuration.n) { + /* This server is present both in the new and in the + * current configuration, so we have already copied its + * next/match index value in the loop above. */ + continue; + } + assert(j == r->configuration.n); + initProgress(&progress[i], last_index); + } + + raft_free(r->leader_state.progress); + r->leader_state.progress = progress; + + return 0; +} + +bool progressIsUpToDate(struct raft *r, unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + raft_index last_index = logLastIndex(r->log); + return p->next_index == last_index + 1; +} + +bool progressPersistedIsUpToDate(struct raft *r, unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + raft_index last_index = logLastIndex(r->log); + return p->match_index == last_index; +} + +bool progressShouldReplicate(struct raft *r, unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + raft_time now = r->io->time(r->io); + bool needs_heartbeat = now - p->last_send >= r->heartbeat_timeout; + raft_index last_index = logLastIndex(r->log); + bool result = false; + + /* We must be in a valid state. */ + assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE || + p->state == PROGRESS__SNAPSHOT); + + /* The next index to send must be lower than the highest index in our + * log. 
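+	 * (It may be at most one past it, when the follower is fully caught
+	 * up.)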
*/ + assert(p->next_index <= last_index + 1); + + switch (p->state) { + case PROGRESS__SNAPSHOT: + /* Snapshot timed out, move to PROBE */ + if (now - p->snapshot_last_send >= + r->install_snapshot_timeout) { + tracef("snapshot timed out for index:%u", i); + result = true; + progressAbortSnapshot(r, i); + } else { + /* Enforce Leadership during follower Snapshot + * installation */ + result = needs_heartbeat; + } + break; + case PROGRESS__PROBE: + /* We send at most one message per heartbeat interval. + */ + result = needs_heartbeat; + break; + case PROGRESS__PIPELINE: + /* In replication mode we send empty append entries + * messages only if haven't sent anything in the last + * heartbeat interval. */ + result = !progressIsUpToDate(r, i) || needs_heartbeat; + break; + } + return result; +} + +raft_index progressNextIndex(struct raft *r, unsigned i) +{ + return r->leader_state.progress[i].next_index; +} + +raft_index progressMatchIndex(struct raft *r, unsigned i) +{ + return r->leader_state.progress[i].match_index; +} + +void progressUpdateLastSend(struct raft *r, unsigned i) +{ + r->leader_state.progress[i].last_send = r->io->time(r->io); +} + +void progressUpdateSnapshotLastSend(struct raft *r, unsigned i) +{ + r->leader_state.progress[i].snapshot_last_send = r->io->time(r->io); +} + +bool progressResetRecentRecv(struct raft *r, const unsigned i) +{ + bool prev = r->leader_state.progress[i].recent_recv; + r->leader_state.progress[i].recent_recv = false; + return prev; +} + +void progressMarkRecentRecv(struct raft *r, const unsigned i) +{ + r->leader_state.progress[i].recent_recv = true; +} + +inline void progressSetFeatures(struct raft *r, + const unsigned i, + raft_flags features) +{ + r->leader_state.progress[i].features = features; +} + +inline raft_flags progressGetFeatures(struct raft *r, const unsigned i) +{ + return r->leader_state.progress[i].features; +} + +bool progressGetRecentRecv(const struct raft *r, const unsigned i) +{ + return r->leader_state.progress[i].recent_recv; +} + +void progressToSnapshot(struct raft *r, unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + p->state = PROGRESS__SNAPSHOT; + p->snapshot_index = logSnapshotIndex(r->log); +} + +void progressAbortSnapshot(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + p->snapshot_index = 0; + p->state = PROGRESS__PROBE; +} + +int progressState(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + return p->state; +} + +bool progressMaybeDecrement(struct raft *r, + const unsigned i, + raft_index rejected, + raft_index last_index) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + + assert(p->state == PROGRESS__PROBE || p->state == PROGRESS__PIPELINE || + p->state == PROGRESS__SNAPSHOT); + + if (p->state == PROGRESS__SNAPSHOT) { + /* The rejection must be stale or spurious if the rejected index + * does not match the last snapshot index. */ + if (rejected != p->snapshot_index) { + return false; + } + progressAbortSnapshot(r, i); + return true; + } + + if (p->state == PROGRESS__PIPELINE) { + /* The rejection must be stale if the rejected index is smaller + * than the matched one. 
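+		 * For example, if match_index is already 7, a rejection
+		 * carrying index 5 must refer to an old request and can be
+		 * ignored.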
*/ + if (rejected <= p->match_index) { + tracef("match index is up to date -> ignore "); + return false; + } + /* Directly decrease next to match + 1 */ + p->next_index = min(rejected, p->match_index + 1); + progressToProbe(r, i); + return true; + } + + /* The rejection must be stale or spurious if the rejected index does + * not match the next index minus one. */ + if (rejected != p->next_index - 1) { + tracef( + "rejected index %llu different from next index %lld -> " + "ignore ", + rejected, p->next_index); + return false; + } + + p->next_index = min(rejected, last_index + 1); + p->next_index = max(p->next_index, 1); + + return true; +} + +void progressOptimisticNextIndex(struct raft *r, + unsigned i, + raft_index next_index) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + p->next_index = next_index; +} + +bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + bool updated = false; + if (p->match_index < last_index) { + p->match_index = last_index; + updated = true; + } + if (p->next_index < last_index + 1) { + p->next_index = last_index + 1; + } + return updated; +} + +void progressToProbe(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + + /* If the current state is snapshot, we know that the pending snapshot + * has been sent to this peer successfully, so we probe from + * snapshot_index + 1.*/ + if (p->state == PROGRESS__SNAPSHOT) { + assert(p->snapshot_index > 0); + p->next_index = max(p->match_index + 1, p->snapshot_index); + p->snapshot_index = 0; + } else { + p->next_index = p->match_index + 1; + } + p->state = PROGRESS__PROBE; +} + +void progressToPipeline(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + p->state = PROGRESS__PIPELINE; +} + +bool progressSnapshotDone(struct raft *r, const unsigned i) +{ + struct raft_progress *p = &r->leader_state.progress[i]; + assert(p->state == PROGRESS__SNAPSHOT); + return p->match_index >= p->snapshot_index; +} + +#undef tracef diff --git a/src/raft/progress.h b/src/raft/progress.h new file mode 100644 index 000000000..b1de1a0e6 --- /dev/null +++ b/src/raft/progress.h @@ -0,0 +1,139 @@ +/* Track replication progress on followers. */ + +#ifndef PROGRESS_H_ +#define PROGRESS_H_ + +#include "../raft.h" + +/* Possible values for the state field of struct raft_progress. */ +enum { + PROGRESS__PROBE = + 0, /* At most one AppendEntries per heartbeat interval */ + PROGRESS__PIPELINE, /* Optimistically stream AppendEntries */ + PROGRESS__SNAPSHOT /* Sending a snapshot */ +}; + +/** + * Used by leaders to keep track of replication progress for each server. + */ +struct raft_progress +{ + unsigned short state; /* Probe, pipeline or snapshot. */ + raft_index next_index; /* Next entry to send. */ + raft_index match_index; /* Highest index reported as replicated. */ + raft_index + snapshot_index; /* Last index of most recent snapshot sent. */ + raft_time last_send; /* Timestamp of last AppendEntries RPC. */ + raft_time + snapshot_last_send; /* Timestamp of last InstallSnaphot RPC. */ + bool recent_recv; /* A msg was received within election timeout. */ + raft_flags features; /* What the server is capable of. */ +}; + +/* Create and initialize the array of progress objects used by the leader to * + * track followers. The match index will be set to zero, and the next index to + * the current last index plus 1. 
*/ +int progressBuildArray(struct raft *r); + +/* Re-build the progress array against a new configuration. + * + * Progress information for servers existing both in the new and in the current + * configuration will remain unchanged. + * + * Progress information for servers existing only in the new configuration will + * be initialized as in progressBuildArray().*/ +int progressRebuildArray(struct raft *r, + const struct raft_configuration *configuration); + +/* Whether the i'th server in the configuration has been sent all the log + * entries. */ +bool progressIsUpToDate(struct raft *r, unsigned i); + +/* Whether the persisted log of the i'th server in the configuration up-to-date + * with ours. */ +bool progressPersistedIsUpToDate(struct raft *r, unsigned i); + +/* Whether a new AppendEntries or InstallSnapshot message should be sent to the + * i'th server at this time. + * + * See the docstring of replicationProgress() for details about how the decision + * is taken. */ +bool progressShouldReplicate(struct raft *r, unsigned i); + +/* Return the index of the next entry that should be sent to the i'th server. */ +raft_index progressNextIndex(struct raft *r, unsigned i); + +/* Return the index of the most recent entry that the i'th server has reported + * as replicated. */ +raft_index progressMatchIndex(struct raft *r, unsigned i); + +/* Update the last_send timestamp after an AppendEntries request has been + * sent. */ +void progressUpdateLastSend(struct raft *r, unsigned i); + +/* Update the snapshot_last_send timestamp after an InstallSnaphot request has + * been sent. */ +void progressUpdateSnapshotLastSend(struct raft *r, unsigned i); + +/* Reset to false the recent_recv flag of the server at the given index, + * returning the previous value. + * + * To be called once every election_timeout milliseconds. */ +bool progressResetRecentRecv(struct raft *r, unsigned i); + +/* Set to true the recent_recv flag of the server at the given index. + * + * To be called whenever we receive an AppendEntries RPC result */ +void progressMarkRecentRecv(struct raft *r, unsigned i); + +/* Return the value of the recent_recv flag. */ +bool progressGetRecentRecv(const struct raft *r, unsigned i); + +/* Convert to the i'th server to snapshot mode. */ +void progressToSnapshot(struct raft *r, unsigned i); + +/* Convert to probe mode. */ +void progressToProbe(struct raft *r, unsigned i); + +/* Convert to pipeline mode. */ +void progressToPipeline(struct raft *r, unsigned i); + +/* Abort snapshot mode and switch to back to probe. + * + * Called after sending the snapshot has failed or timed out. */ +void progressAbortSnapshot(struct raft *r, unsigned i); + +/* Return the progress mode code for the i'th server. */ +int progressState(struct raft *r, unsigned i); + +/* Optimistically update the next index of the given server. + * + * Called in pipeline mode after sending new entries. */ +void progressOptimisticNextIndex(struct raft *r, + unsigned i, + raft_index next_index); + +/* Return false if the given @index comes from an outdated message. Otherwise + * update the progress and returns true. To be called when receiving a + * successful AppendEntries RPC response. */ +bool progressMaybeUpdate(struct raft *r, unsigned i, raft_index last_index); + +/* Return false if the given rejected index comes from an out of order + * message. Otherwise decrease the progress next index to min(rejected, + * last_index) and returns true. To be called when receiving an unsuccessful + * AppendEntries RPC response. 
*/ +bool progressMaybeDecrement(struct raft *r, + unsigned i, + raft_index rejected, + raft_index last_index); + +/* Return true if match_index is equal or higher than the snapshot_index. */ +bool progressSnapshotDone(struct raft *r, unsigned i); + +/* Sets the feature flags of a node. */ +void progressSetFeatures(struct raft *r, const unsigned i, raft_flags features); + +/* Gets the feature flags of a node. */ +raft_flags progressGetFeatures(struct raft *r, const unsigned i); + +#endif /* PROGRESS_H_ */ diff --git a/src/raft/queue.h b/src/raft/queue.h new file mode 100644 index 000000000..1262cf554 --- /dev/null +++ b/src/raft/queue.h @@ -0,0 +1,57 @@ +#ifndef QUEUE_H_ +#define QUEUE_H_ + +#include + +typedef void *queue[2]; + +/* Private macros. */ +#define QUEUE_NEXT(q) (*(queue **)&((*(q))[0])) +#define QUEUE_PREV(q) (*(queue **)&((*(q))[1])) + +#define QUEUE_PREV_NEXT(q) (QUEUE_NEXT(QUEUE_PREV(q))) +#define QUEUE_NEXT_PREV(q) (QUEUE_PREV(QUEUE_NEXT(q))) + +/* Initialize an empty queue. */ +#define QUEUE_INIT(q) \ + { \ + QUEUE_NEXT(q) = (q); \ + QUEUE_PREV(q) = (q); \ + } + +/* Return true if the queue has no element. */ +#define QUEUE_IS_EMPTY(q) ((const queue *)(q) == (const queue *)QUEUE_NEXT(q)) + +/* Insert an element at the back of a queue. */ +#define QUEUE_PUSH(q, e) \ + { \ + QUEUE_NEXT(e) = (q); \ + QUEUE_PREV(e) = QUEUE_PREV(q); \ + QUEUE_PREV_NEXT(e) = (e); \ + QUEUE_PREV(q) = (e); \ + } + +/* Remove the given element from the queue. Any element can be removed at any * + * time. */ +#define QUEUE_REMOVE(e) \ + { \ + QUEUE_PREV_NEXT(e) = QUEUE_NEXT(e); \ + QUEUE_NEXT_PREV(e) = QUEUE_PREV(e); \ + } + +/* Return the element at the front of the queue. */ +#define QUEUE_HEAD(q) (QUEUE_NEXT(q)) + +/* Return the element at the back of the queue. */ +#define QUEUE_TAIL(q) (QUEUE_PREV(q)) + +/* Iterate over the element of a queue. * Mutating the queue while iterating + * results in undefined behavior. */ +#define QUEUE_FOREACH(q, e) \ + for ((q) = QUEUE_NEXT(e); (q) != (e); (q) = QUEUE_NEXT(q)) + +/* Return the structure holding the given element. */ +#define QUEUE_DATA(e, type, field) \ + ((type *)((void *)((char *)(e)-offsetof(type, field)))) + +#endif /* QUEUE_H_*/ diff --git a/src/raft/raft.c b/src/raft/raft.c new file mode 100644 index 000000000..e1ff0c41b --- /dev/null +++ b/src/raft/raft.c @@ -0,0 +1,304 @@ +#include "../raft.h" + +#include + +#include "../tracing.h" +#include "assert.h" +#include "byte.h" +#include "callbacks.h" +#include "configuration.h" +#include "convert.h" +#include "election.h" +#include "err.h" +#include "flags.h" +#include "heap.h" +#include "log.h" +#include "membership.h" + +#define DEFAULT_ELECTION_TIMEOUT 1000 /* One second */ +#define DEFAULT_HEARTBEAT_TIMEOUT 100 /* One tenth of a second */ +#define DEFAULT_INSTALL_SNAPSHOT_TIMEOUT 30000 /* 30 seconds */ +#define DEFAULT_SNAPSHOT_THRESHOLD 1024 +#define DEFAULT_SNAPSHOT_TRAILING 2048 + +/* Number of milliseconds after which a server promotion will be aborted if the + * server hasn't caught up with the logs yet. 
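+ *
+ * DEFAULT_MAX_CATCH_UP_ROUNDS caps how many catch-up rounds are attempted,
+ * while DEFAULT_MAX_CATCH_UP_ROUND_DURATION caps the duration of a single
+ * round, in milliseconds (5 * 1000 = 5 seconds).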
*/ +#define DEFAULT_MAX_CATCH_UP_ROUNDS 10 +#define DEFAULT_MAX_CATCH_UP_ROUND_DURATION (5 * 1000) + +int raft_version_number(void) +{ + return RAFT_VERSION_NUMBER; +} + +static int ioFsmVersionCheck(struct raft *r, + struct raft_io *io, + struct raft_fsm *fsm); + +int raft_init(struct raft *r, + struct raft_io *io, + struct raft_fsm *fsm, + const raft_id id, + const char *address) +{ + int rv; + assert(r != NULL); + + rv = ioFsmVersionCheck(r, io, fsm); + if (rv != 0) { + goto err; + } + + r->io = io; + r->io->data = r; + r->fsm = fsm; + + r->tracer = NULL; + + r->id = id; + /* Make a copy of the address */ + r->address = RaftHeapMalloc(strlen(address) + 1); + if (r->address == NULL) { + rv = RAFT_NOMEM; + goto err; + } + strcpy(r->address, address); + r->current_term = 0; + r->voted_for = 0; + r->log = logInit(); + if (r->log == NULL) { + rv = RAFT_NOMEM; + goto err_after_address_alloc; + } + + raft_configuration_init(&r->configuration); + raft_configuration_init(&r->configuration_last_snapshot); + r->configuration_committed_index = 0; + r->configuration_uncommitted_index = 0; + r->election_timeout = DEFAULT_ELECTION_TIMEOUT; + r->heartbeat_timeout = DEFAULT_HEARTBEAT_TIMEOUT; + r->install_snapshot_timeout = DEFAULT_INSTALL_SNAPSHOT_TIMEOUT; + r->commit_index = 0; + r->last_applied = 0; + r->last_stored = 0; + r->state = RAFT_UNAVAILABLE; + r->leader_state.voter_contacts = 0; + rv = raftInitCallbacks(r); + if (rv != 0) { + goto err_after_address_alloc; + } + r->transfer = NULL; + r->snapshot.pending.term = 0; + r->snapshot.threshold = DEFAULT_SNAPSHOT_THRESHOLD; + r->snapshot.trailing = DEFAULT_SNAPSHOT_TRAILING; + r->snapshot.put.data = NULL; + r->close_cb = NULL; + memset(r->errmsg, 0, sizeof r->errmsg); + r->pre_vote = false; + r->max_catch_up_rounds = DEFAULT_MAX_CATCH_UP_ROUNDS; + r->max_catch_up_round_duration = DEFAULT_MAX_CATCH_UP_ROUND_DURATION; + rv = r->io->init(r->io, r->id, r->address); + if (rv != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); + goto err_after_callbacks_alloc; + } + return 0; + +err_after_callbacks_alloc: + raftDestroyCallbacks(r); +err_after_address_alloc: + RaftHeapFree(r->address); +err: + assert(rv != 0); + return rv; +} + +static void ioCloseCb(struct raft_io *io) +{ + struct raft *r = io->data; + tracef("io close cb"); + raftDestroyCallbacks(r); + raft_free(r->address); + logClose(r->log); + raft_configuration_close(&r->configuration); + raft_configuration_close(&r->configuration_last_snapshot); + if (r->close_cb != NULL) { + r->close_cb(r); + } +} + +void raft_close(struct raft *r, void (*cb)(struct raft *r)) +{ + assert(r->close_cb == NULL); + if (r->state != RAFT_UNAVAILABLE) { + convertToUnavailable(r); + } + r->close_cb = cb; + r->io->close(r->io, ioCloseCb); +} + +void raft_register_state_cb(struct raft *r, raft_state_cb cb) +{ + struct raft_callbacks *cbs = raftGetCallbacks(r); + assert(cbs != NULL); + cbs->state_cb = cb; +} + +void raft_set_election_timeout(struct raft *r, const unsigned msecs) +{ + r->election_timeout = msecs; +} + +void raft_set_heartbeat_timeout(struct raft *r, const unsigned msecs) +{ + r->heartbeat_timeout = msecs; +} + +void raft_set_install_snapshot_timeout(struct raft *r, const unsigned msecs) +{ + r->install_snapshot_timeout = msecs; +} + +void raft_set_snapshot_threshold(struct raft *r, unsigned n) +{ + r->snapshot.threshold = n; +} + +void raft_set_snapshot_trailing(struct raft *r, unsigned n) +{ + r->snapshot.trailing = n; +} + +void raft_set_max_catch_up_rounds(struct raft *r, unsigned n) +{ + 
r->max_catch_up_rounds = n; +} + +void raft_set_max_catch_up_round_duration(struct raft *r, unsigned msecs) +{ + r->max_catch_up_round_duration = msecs; +} + +void raft_set_pre_vote(struct raft *r, bool enabled) +{ + r->pre_vote = enabled; +} + +const char *raft_errmsg(struct raft *r) +{ + return r->errmsg; +} + +int raft_voter_contacts(struct raft *r) +{ + int ret; + if (r->state == RAFT_LEADER) { + ret = (int)r->leader_state.voter_contacts; + } else { + ret = -1; + } + return ret; +} + +int raft_bootstrap(struct raft *r, const struct raft_configuration *conf) +{ + int rv; + + if (r->state != RAFT_UNAVAILABLE) { + return RAFT_BUSY; + } + + rv = r->io->bootstrap(r->io, conf); + if (rv != 0) { + return rv; + } + + return 0; +} + +int raft_recover(struct raft *r, const struct raft_configuration *conf) +{ + int rv; + + if (r->state != RAFT_UNAVAILABLE) { + return RAFT_BUSY; + } + + rv = r->io->recover(r->io, conf); + if (rv != 0) { + return rv; + } + + return 0; +} + +const char *raft_strerror(int errnum) +{ + return errCodeToString(errnum); +} + +void raft_configuration_init(struct raft_configuration *c) +{ + configurationInit(c); +} + +void raft_configuration_close(struct raft_configuration *c) +{ + configurationClose(c); +} + +int raft_configuration_add(struct raft_configuration *c, + const raft_id id, + const char *address, + const int role) +{ + return configurationAdd(c, id, address, role); +} + +int raft_configuration_encode(const struct raft_configuration *c, + struct raft_buffer *buf) +{ + return configurationEncode(c, buf); +} + +unsigned long long raft_digest(const char *text, unsigned long long n) +{ + struct byteSha1 sha1; + uint8_t value[20]; + uint64_t n64 = byteFlip64((uint64_t)n); + uint64_t digest; + + byteSha1Init(&sha1); + byteSha1Update(&sha1, (const uint8_t *)text, (uint32_t)strlen(text)); + byteSha1Update(&sha1, (const uint8_t *)&n64, (uint32_t)(sizeof n64)); + byteSha1Digest(&sha1, value); + + memcpy(&digest, value + (sizeof value - sizeof digest), sizeof digest); + + return byteFlip64(digest); +} + +static int ioFsmVersionCheck(struct raft *r, + struct raft_io *io, + struct raft_fsm *fsm) +{ + if (io->version == 0) { + ErrMsgPrintf(r->errmsg, "io->version must be set"); + return -1; + } + + if (fsm->version == 0) { + ErrMsgPrintf(r->errmsg, "fsm->version must be set"); + return -1; + } + + if ((fsm->version > 2 && fsm->snapshot_async != NULL) && + ((io->version < 2) || (io->async_work == NULL))) { + ErrMsgPrintf(r->errmsg, + "async snapshot requires io->version > 1 and " + "async_work method."); + return -1; + } + + return 0; +} diff --git a/src/raft/recv.c b/src/raft/recv.c new file mode 100644 index 000000000..5f0da1723 --- /dev/null +++ b/src/raft/recv.c @@ -0,0 +1,225 @@ +#include "recv.h" + +#include "../tracing.h" +#include "assert.h" +#include "convert.h" +#include "entry.h" +#include "heap.h" +#include "log.h" +#include "membership.h" +#include "recv_append_entries.h" +#include "recv_append_entries_result.h" +#include "recv_install_snapshot.h" +#include "recv_request_vote.h" +#include "recv_request_vote_result.h" +#include "recv_timeout_now.h" +#include "string.h" + +/* Dispatch a single RPC message to the appropriate handler. 
*/ +static int recvMessage(struct raft *r, struct raft_message *message) +{ + int rv = 0; + + switch (message->type) { + case RAFT_IO_APPEND_ENTRIES: + rv = recvAppendEntries(r, message->server_id, + message->server_address, + &message->append_entries); + if (rv != 0) { + entryBatchesDestroy( + message->append_entries.entries, + message->append_entries.n_entries); + } + break; + case RAFT_IO_APPEND_ENTRIES_RESULT: + rv = recvAppendEntriesResult( + r, message->server_id, message->server_address, + &message->append_entries_result); + break; + case RAFT_IO_REQUEST_VOTE: + rv = recvRequestVote(r, message->server_id, + message->server_address, + &message->request_vote); + break; + case RAFT_IO_REQUEST_VOTE_RESULT: + rv = recvRequestVoteResult( + r, message->server_id, message->server_address, + &message->request_vote_result); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + rv = recvInstallSnapshot(r, message->server_id, + message->server_address, + &message->install_snapshot); + /* Already installing a snapshot, wait for it and ignore + * this one */ + if (rv == RAFT_BUSY) { + raft_free(message->install_snapshot.data.base); + raft_configuration_close( + &message->install_snapshot.conf); + rv = 0; + } + break; + case RAFT_IO_TIMEOUT_NOW: + rv = recvTimeoutNow(r, message->server_id, + message->server_address, + &message->timeout_now); + break; + default: + tracef("received unknown message type (%d)", + message->type); + /* Drop message */ + return 0; + }; + + if (rv != 0 && rv != RAFT_NOCONNECTION) { + tracef("recv: %d: %s", message->type, raft_strerror(rv)); + return rv; + } + + /* If there's a leadership transfer in progress, check if it has + * completed. */ + if (r->transfer != NULL) { + if (r->follower_state.current_leader.id == r->transfer->id) { + membershipLeadershipTransferClose(r); + } + } + + return 0; +} + +void recvCb(struct raft_io *io, struct raft_message *message) +{ + struct raft *r = io->data; + int rv; + if (r->state == RAFT_UNAVAILABLE) { + switch (message->type) { + case RAFT_IO_APPEND_ENTRIES: + entryBatchesDestroy( + message->append_entries.entries, + message->append_entries.n_entries); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + raft_configuration_close( + &message->install_snapshot.conf); + raft_free(message->install_snapshot.data.base); + break; + } + return; + } + rv = recvMessage(r, message); + if (rv != 0) { + convertToUnavailable(r); + } +} + +int recvBumpCurrentTerm(struct raft *r, raft_term term) +{ + int rv; + char msg[128]; + + assert(r != NULL); + assert(term > r->current_term); + + sprintf(msg, "remote term %lld is higher than %lld -> bump local term", + term, r->current_term); + if (r->state != RAFT_FOLLOWER) { + strcat(msg, " and step down"); + } + tracef("%s", msg); + + /* Save the new term to persistent store, resetting the vote. */ + rv = r->io->set_term(r->io, term); + if (rv != 0) { + return rv; + } + + /* Update our cache too. */ + r->current_term = term; + r->voted_for = 0; + + if (r->state != RAFT_FOLLOWER) { + /* Also convert to follower. 
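+		 * From the state diagram in Figure 3.3: a candidate or
+		 * leader that discovers a higher term immediately reverts
+		 * to follower state.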
*/ + convertToFollower(r); + } + + return 0; +} + +void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match) +{ + if (term < r->current_term) { + *match = -1; + } else if (term > r->current_term) { + *match = 1; + } else { + *match = 0; + } +} + +int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match) +{ + int rv; + + assert(r != NULL); + assert(match != NULL); + + recvCheckMatchingTerms(r, term, match); + + if (*match == -1) { + tracef("old term - current_term:%llu other_term:%llu", + r->current_term, term); + return 0; + } + + /* From Figure 3.1: + * + * Rules for Servers: All Servers: If RPC request or response contains + * term T > currentTerm: set currentTerm = T, convert to follower. + * + * From state diagram in Figure 3.3: + * + * [leader]: discovers server with higher term -> [follower] + * + * From Section 3.3: + * + * If a candidate or leader discovers that its term is out of date, it + * immediately reverts to follower state. + */ + if (*match == 1) { + rv = recvBumpCurrentTerm(r, term); + if (rv != 0) { + tracef("recvBumpCurrentTerm failed %d", rv); + return rv; + } + } + + return 0; +} + +int recvUpdateLeader(struct raft *r, const raft_id id, const char *address) +{ + assert(r->state == RAFT_FOLLOWER); + + r->follower_state.current_leader.id = id; + + /* If the address of the current leader is the same as the given one, + * we're done. */ + if (r->follower_state.current_leader.address != NULL && + strcmp(address, r->follower_state.current_leader.address) == 0) { + return 0; + } + + if (r->follower_state.current_leader.address != NULL) { + RaftHeapFree(r->follower_state.current_leader.address); + } + r->follower_state.current_leader.address = + RaftHeapMalloc(strlen(address) + 1); + if (r->follower_state.current_leader.address == NULL) { + return RAFT_NOMEM; + } + strcpy(r->follower_state.current_leader.address, address); + + return 0; +} + +#undef tracef diff --git a/src/raft/recv.h b/src/raft/recv.h new file mode 100644 index 000000000..df1fece75 --- /dev/null +++ b/src/raft/recv.h @@ -0,0 +1,44 @@ +/* Receive an RPC message. */ + +#ifndef RECV_H_ +#define RECV_H_ + +#include "../raft.h" + +/* Callback to be passed to the raft_io implementation. It will be invoked upon + * receiving an RPC message. */ +void recvCb(struct raft_io *io, struct raft_message *message); + +/* Compare a request's term with the server's current term. + * + * The match output parameter will be set to 0 if the local term matches the + * request's term, to -1 if the request's term is lower, and to 1 if the + * request's term is higher. */ +void recvCheckMatchingTerms(struct raft *r, raft_term term, int *match); + +/* Bump the current term and possibly step down from candidate or leader + * state. */ +int recvBumpCurrentTerm(struct raft *r, raft_term term); + +/* Common logic for RPC handlers, comparing the request's term with the server's + * current term and possibly deciding to reject the request or step down from + * candidate or leader. + * + * From Section 3.3: + * + * If a candidate or leader discovers that its term is out of date, it + * immediately reverts to follower state. If a server receives a request with + * a stale term number, it rejects the request. 
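+ *
+ * A minimal usage sketch, mirroring the RPC handlers in this module
+ * (args->term stands for the term carried by the incoming request):
+ *
+ *   int match;
+ *   rv = recvEnsureMatchingTerms(r, args->term, &match);
+ *   if (rv != 0) {
+ *           return rv;
+ *   }
+ *   if (match < 0) {
+ *           goto reply;    (local term is higher: reject)
+ *   }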
+ * + * The match output parameter will be set to 0 if the local term matches the + * request's term, to -1 if the request's term is lower, and to 1 if the + * request's term was higher but we have successfully bumped the local one to + * match it (and stepped down to follower in that case, if we were not + * follower already). */ +int recvEnsureMatchingTerms(struct raft *r, raft_term term, int *match); + +/* If different from the current one, update information about the current + * leader. Must be called only by followers. */ +int recvUpdateLeader(struct raft *r, raft_id id, const char *address); + +#endif /* RECV_H_ */ diff --git a/src/raft/recv_append_entries.c b/src/raft/recv_append_entries.c new file mode 100644 index 000000000..7d4adbcc0 --- /dev/null +++ b/src/raft/recv_append_entries.c @@ -0,0 +1,167 @@ +#include "recv_append_entries.h" + +#include "../tracing.h" +#include "assert.h" +#include "convert.h" +#include "entry.h" +#include "flags.h" +#include "heap.h" +#include "log.h" +#include "recv.h" +#include "replication.h" + +static void recvSendAppendEntriesResultCb(struct raft_io_send *req, int status) +{ + (void)status; + RaftHeapFree(req); +} + +int recvAppendEntries(struct raft *r, + raft_id id, + const char *address, + const struct raft_append_entries *args) +{ + struct raft_io_send *req; + struct raft_message message; + struct raft_append_entries_result *result = + &message.append_entries_result; + int match; + bool async; + int rv; + + assert(r != NULL); + assert(id > 0); + assert(args != NULL); + assert(address != NULL); + tracef( + "self:%llu from:%llu@%s leader_commit:%llu n_entries:%d " + "prev_log_index:%llu prev_log_term:%llu, term:%llu", + r->id, id, address, args->leader_commit, args->n_entries, + args->prev_log_index, args->prev_log_term, args->term); + + result->rejected = args->prev_log_index; + result->last_log_index = logLastIndex(r->log); + result->version = RAFT_APPEND_ENTRIES_RESULT_VERSION; + result->features = RAFT_DEFAULT_FEATURE_FLAGS; + + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } + + /* From Figure 3.1: + * + * AppendEntries RPC: Receiver implementation: Reply false if term < + * currentTerm. + */ + if (match < 0) { + tracef("local term is higher -> reject "); + goto reply; + } + + /* If we get here it means that the term in the request matches our + * current term or it was higher and we have possibly stepped down, + * because we discovered the current leader: + * + * From Figure 3.1: + * + * Rules for Servers: Candidates: if AppendEntries RPC is received + * from new leader: convert to follower. + * + * From Section 3.4: + * + * While waiting for votes, a candidate may receive an AppendEntries + * RPC from another server claiming to be leader. If the leader's term + * (included in its RPC) is at least as large as the candidate's + * current term, then the candidate recognizes the leader as legitimate + * and returns to follower state. If the term in the RPC is smaller than + * the candidate's current term, then the candidate rejects the RPC and + * continues in candidate state. + * + * From state diagram in Figure 3.3: + * + * [candidate]: discovers current leader -> [follower] + * + * Note that it should not be possible for us to be in leader state, + * because the leader that is sending us the request should have either + * a lower term (and in that case we reject the request above), or a + * higher term (and in that case we step down). 
It can't have the same + * term because at most one leader can be elected at any given term. + */ + assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); + assert(r->current_term == args->term); + + if (r->state == RAFT_CANDIDATE) { + /* The current term and the peer one must match, otherwise we + * would have either rejected the request or stepped down to + * followers. */ + assert(match == 0); + tracef("discovered leader -> step down "); + convertToFollower(r); + } + + assert(r->state == RAFT_FOLLOWER); + + /* Update current leader because the term in this AppendEntries RPC is + * up to date. */ + rv = recvUpdateLeader(r, id, address); + if (rv != 0) { + return rv; + } + + /* Reset the election timer. */ + r->election_timer_start = r->io->time(r->io); + + /* If we are installing a snapshot, ignore these entries. TODO: we + * should do something smarter, e.g. buffering the entries in the I/O + * backend, which should be in charge of serializing everything. */ + if (replicationInstallSnapshotBusy(r) && args->n_entries > 0) { + tracef("ignoring AppendEntries RPC during snapshot install"); + entryBatchesDestroy(args->entries, args->n_entries); + return 0; + } + + rv = replicationAppend(r, args, &result->rejected, &async); + if (rv != 0) { + return rv; + } + + if (async) { + return 0; + } + + /* Echo back to the leader the point that we reached. */ + result->last_log_index = r->last_stored; + +reply: + result->term = r->current_term; + + /* Free the entries batch, if any. */ + if (args->n_entries > 0 && args->entries[0].batch != NULL) { + raft_free(args->entries[0].batch); + } + + if (args->entries != NULL) { + raft_free(args->entries); + } + + message.type = RAFT_IO_APPEND_ENTRIES_RESULT; + message.server_id = id; + message.server_address = address; + + req = RaftHeapMalloc(sizeof *req); + if (req == NULL) { + return RAFT_NOMEM; + } + req->data = r; + + rv = r->io->send(r->io, req, &message, recvSendAppendEntriesResultCb); + if (rv != 0) { + raft_free(req); + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_append_entries.h b/src/raft/recv_append_entries.h new file mode 100644 index 000000000..5b674860f --- /dev/null +++ b/src/raft/recv_append_entries.h @@ -0,0 +1,14 @@ +/* Receive an AppendEntries message. */ + +#ifndef RECV_APPEND_ENTRIES_H_ +#define RECV_APPEND_ENTRIES_H_ + +#include "../raft.h" + +/* Process an AppendEntries RPC from the given server. 
*/ +int recvAppendEntries(struct raft *r, + raft_id id, + const char *address, + const struct raft_append_entries *args); + +#endif /* RECV_APPEND_ENTRIES_H_ */ diff --git a/src/raft/recv_append_entries_result.c b/src/raft/recv_append_entries_result.c new file mode 100644 index 000000000..ddef54f14 --- /dev/null +++ b/src/raft/recv_append_entries_result.c @@ -0,0 +1,75 @@ +#include "recv_append_entries_result.h" +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "recv.h" +#include "replication.h" + +int recvAppendEntriesResult(struct raft *r, + const raft_id id, + const char *address, + const struct raft_append_entries_result *result) +{ + int match; + const struct raft_server *server; + int rv; + + assert(r != NULL); + assert(id > 0); + assert(address != NULL); + assert(result != NULL); + + tracef( + "self:%llu from:%llu@%s last_log_index:%llu rejected:%llu " + "term:%llu", + r->id, id, address, result->last_log_index, result->rejected, + result->term); + + if (r->state != RAFT_LEADER) { + tracef("local server is not leader -> ignore"); + return 0; + } + + rv = recvEnsureMatchingTerms(r, result->term, &match); + if (rv != 0) { + return rv; + } + + if (match < 0) { + tracef("local term is higher -> ignore "); + return 0; + } + + /* If we have stepped down, abort here. + * + * From Figure 3.1: + * + * [Rules for Servers] All Servers: If RPC request or response + * contains term T > currentTerm: set currentTerm = T, convert to + * follower. + */ + if (match > 0) { + assert(r->state == RAFT_FOLLOWER); + return 0; + } + + assert(result->term == r->current_term); + + /* Ignore responses from servers that have been removed */ + server = configurationGet(&r->configuration, id); + if (server == NULL) { + tracef("unknown server -> ignore"); + return 0; + } + + /* Update the progress of this server, possibly sending further entries. + */ + rv = replicationUpdate(r, server, result); + if (rv != 0) { + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_append_entries_result.h b/src/raft/recv_append_entries_result.h new file mode 100644 index 000000000..8cf8524ac --- /dev/null +++ b/src/raft/recv_append_entries_result.h @@ -0,0 +1,14 @@ +/* Receive an AppendEntries result message. */ + +#ifndef RECV_APPEND_ENTRIES_RESULT_H_ +#define RECV_APPEND_ENTRIES_RESULT_H_ + +#include "../raft.h" + +/* Process an AppendEntries RPC result from the given server. 
*/ +int recvAppendEntriesResult(struct raft *r, + raft_id id, + const char *address, + const struct raft_append_entries_result *result); + +#endif /* RECV_APPEND_ENTRIES_RESULT_H_ */ diff --git a/src/raft/recv_install_snapshot.c b/src/raft/recv_install_snapshot.c new file mode 100644 index 000000000..d3e1493a2 --- /dev/null +++ b/src/raft/recv_install_snapshot.c @@ -0,0 +1,109 @@ +#include "recv_install_snapshot.h" + +#include "../tracing.h" +#include "assert.h" +#include "convert.h" +#include "flags.h" +#include "log.h" +#include "recv.h" +#include "replication.h" + +static void installSnapshotSendCb(struct raft_io_send *req, int status) +{ + (void)status; + raft_free(req); +} + +int recvInstallSnapshot(struct raft *r, + const raft_id id, + const char *address, + struct raft_install_snapshot *args) +{ + struct raft_io_send *req; + struct raft_message message; + struct raft_append_entries_result *result = + &message.append_entries_result; + int rv; + int match; + bool async; + + assert(address != NULL); + tracef( + "self:%llu from:%llu@%s conf_index:%llu last_index:%llu " + "last_term:%llu " + "term:%llu", + r->id, id, address, args->conf_index, args->last_index, + args->last_term, args->term); + + result->rejected = args->last_index; + result->last_log_index = logLastIndex(r->log); + result->version = RAFT_APPEND_ENTRIES_RESULT_VERSION; + result->features = RAFT_DEFAULT_FEATURE_FLAGS; + + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } + + if (match < 0) { + tracef("local term is higher -> reject "); + goto reply; + } + + /* TODO: this logic duplicates the one in the AppendEntries handler */ + assert(r->state == RAFT_FOLLOWER || r->state == RAFT_CANDIDATE); + assert(r->current_term == args->term); + if (r->state == RAFT_CANDIDATE) { + assert(match == 0); + tracef("discovered leader -> step down "); + convertToFollower(r); + } + + rv = recvUpdateLeader(r, id, address); + if (rv != 0) { + return rv; + } + r->election_timer_start = r->io->time(r->io); + + rv = replicationInstallSnapshot(r, args, &result->rejected, &async); + if (rv != 0) { + tracef("replicationInstallSnapshot failed %d", rv); + return rv; + } + + if (async) { + return 0; + } + + if (result->rejected == 0) { + /* Echo back to the leader the point that we reached. */ + result->last_log_index = args->last_index; + } + +reply: + result->term = r->current_term; + + /* Free the snapshot data. */ + raft_configuration_close(&args->conf); + raft_free(args->data.base); + + message.type = RAFT_IO_APPEND_ENTRIES_RESULT; + message.server_id = id; + message.server_address = address; + + req = raft_malloc(sizeof *req); + if (req == NULL) { + return RAFT_NOMEM; + } + req->data = r; + + rv = r->io->send(r->io, req, &message, installSnapshotSendCb); + if (rv != 0) { + raft_free(req); + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_install_snapshot.h b/src/raft/recv_install_snapshot.h new file mode 100644 index 000000000..957c605b2 --- /dev/null +++ b/src/raft/recv_install_snapshot.h @@ -0,0 +1,14 @@ +/* InstallSnapshot RPC handlers. */ + +#ifndef RECV_INSTALL_SNAPSHOT_H_ +#define RECV_INSTALL_SNAPSHOT_H_ + +#include "../raft.h" + +/* Process an InstallSnapshot RPC from the given server. 
*/ +int recvInstallSnapshot(struct raft *r, + raft_id id, + const char *address, + struct raft_install_snapshot *args); + +#endif /* RECV_INSTALL_SNAPSHOT_H_ */ diff --git a/src/raft/recv_request_vote.c b/src/raft/recv_request_vote.c new file mode 100644 index 000000000..f51742869 --- /dev/null +++ b/src/raft/recv_request_vote.c @@ -0,0 +1,150 @@ +#include "recv_request_vote.h" + +#include "../tracing.h" +#include "assert.h" +#include "election.h" +#include "recv.h" +#include "replication.h" + +static void requestVoteSendCb(struct raft_io_send *req, int status) +{ + (void)status; + raft_free(req); +} + +int recvRequestVote(struct raft *r, + const raft_id id, + const char *address, + const struct raft_request_vote *args) +{ + struct raft_io_send *req; + struct raft_message message; + struct raft_request_vote_result *result = &message.request_vote_result; + bool has_leader; + int match; + int rv; + + assert(r != NULL); + assert(id > 0); + assert(args != NULL); + + tracef( + "self:%llu from:%llu@%s candidate_id:%llu disrupt_leader:%d " + "last_log_index:%llu " + "last_log_term:%llu pre_vote:%d term:%llu", + r->id, id, address, args->candidate_id, args->disrupt_leader, + args->last_log_index, args->last_log_term, args->pre_vote, + args->term); + result->vote_granted = false; + result->pre_vote = args->pre_vote; + result->version = RAFT_REQUEST_VOTE_RESULT_VERSION; + + /* Reject the request if we have a leader. + * + * From Section 4.2.3: + * + * [Removed] servers should not be able to disrupt a leader whose + * cluster is receiving heartbeats. [...] If a server receives a + * RequestVote request within the minimum election timeout of hearing + * from a current leader, it does not update its term or grant its vote + * + * From Section 4.2.3: + * + * This change conflicts with the leadership transfer mechanism as + * described in Chapter 3, in which a server legitimately starts an + * election without waiting an election timeout. In that case, + * RequestVote messages should be processed by other servers even when + * they believe a current cluster leader exists. Those RequestVote + * requests can include a special flag to indicate this behavior ("I + * have permission to disrupt the leader - it told me to!"). + */ + has_leader = r->state == RAFT_LEADER || + (r->state == RAFT_FOLLOWER && + r->follower_state.current_leader.id != 0); + if (has_leader && !args->disrupt_leader) { + tracef("local server has a leader -> reject "); + goto reply; + } + + /* If this is a pre-vote request, don't actually increment our term or + * persist the vote. */ + if (args->pre_vote) { + recvCheckMatchingTerms(r, args->term, &match); + } else { + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } + } + + /* Reject the request if we are installing a snapshot. + * + * This condition should only be reachable if the disrupt_leader flag is + * set, since otherwise we wouldn't have passed the have_leader check + * above (follower state is not cleared while a snapshot is being + * installed). */ + if (replicationInstallSnapshotBusy(r)) { + tracef("installing snapshot -> reject (disrupt_leader:%d)", + (int)args->disrupt_leader); + goto reply; + } + + /* From Figure 3.1: + * + * RequestVote RPC: Receiver implementation: Reply false if + * term < currentTerm. 
+ * + */ + if (match < 0) { + tracef("local term is higher -> reject "); + goto reply; + } + + /* Unless this is a pre-vote request, at this point our term must be the + * same as the request term (otherwise we would have rejected the + * request or bumped our term). */ + if (!args->pre_vote) { + tracef("no pre_vote: current_term:%llu term:%llu", + r->current_term, args->term); + assert(r->current_term == args->term); + } + + rv = electionVote(r, args, &result->vote_granted); + if (rv != 0) { + return rv; + } + +reply: + result->term = r->current_term; + /* Nodes don't update their term when seeing a Pre-Vote RequestVote RPC. + * To prevent the candidate from ignoring the response of this node if + * it has a smaller term than the candidate, we include the term of the + * request. The smaller term can occur if this node was partitioned from + * the cluster and has reestablished connectivity. This prevents a + * cluster deadlock when a majority of the nodes is online, but they + * fail to establish quorum because the vote of a former partitioned + * node with a smaller term is needed for majority.*/ + if (args->pre_vote) { + result->term = args->term; + } + + message.type = RAFT_IO_REQUEST_VOTE_RESULT; + message.server_id = id; + message.server_address = address; + + req = raft_malloc(sizeof *req); + if (req == NULL) { + return RAFT_NOMEM; + } + req->data = r; + + rv = r->io->send(r->io, req, &message, requestVoteSendCb); + if (rv != 0) { + raft_free(req); + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_request_vote.h b/src/raft/recv_request_vote.h new file mode 100644 index 000000000..9f2583e33 --- /dev/null +++ b/src/raft/recv_request_vote.h @@ -0,0 +1,14 @@ +/* RequestVote RPC handler. */ + +#ifndef RECV_REQUEST_VOTE_H_ +#define RECV_REQUEST_VOTE_H_ + +#include "../raft.h" + +/* Process a RequestVote RPC from the given server. */ +int recvRequestVote(struct raft *r, + raft_id id, + const char *address, + const struct raft_request_vote *args); + +#endif /* RECV_REQUEST_VOTE_H_ */ diff --git a/src/raft/recv_request_vote_result.c b/src/raft/recv_request_vote_result.c new file mode 100644 index 000000000..ca7ece487 --- /dev/null +++ b/src/raft/recv_request_vote_result.c @@ -0,0 +1,154 @@ +#include "recv_request_vote_result.h" + +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "convert.h" +#include "election.h" +#include "recv.h" +#include "replication.h" + +int recvRequestVoteResult(struct raft *r, + raft_id id, + const char *address, + const struct raft_request_vote_result *result) +{ + size_t votes_index; + int match; + int rv; + + (void)address; + + assert(r != NULL); + assert(id > 0); + + tracef( + "self:%llu from:%llu@%s term:%llu vote_granted:%d pre_vote:%d " + "version:%d", + r->id, id, address, result->term, result->vote_granted, + result->pre_vote, result->version); + votes_index = configurationIndexOfVoter(&r->configuration, id); + if (votes_index == r->configuration.n) { + tracef("non-voting or unknown server -> reject"); + return 0; + } + + /* Ignore responses if we are not candidate anymore */ + if (r->state != RAFT_CANDIDATE) { + tracef("local server is not candidate -> ignore"); + return 0; + } + + /* If we're in the pre-vote phase, don't actually increment our term + * right now (we'll do it later, if we start the second phase), and also + * don't step down if the peer is just one term ahead (this is okay as + * in the request we sent our current term plus one). 
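+	 * Example (hypothetical terms): a candidate at term 4 sends
+	 * pre-vote requests carrying term 5, so a response with term 5 is
+	 * exactly one ahead and must not make us step down, while a
+	 * response with term 6 or higher really does come from a newer
+	 * term and bumps ours below.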
*/ + if (r->candidate_state.in_pre_vote) { + recvCheckMatchingTerms(r, result->term, &match); + } else { + rv = recvEnsureMatchingTerms(r, result->term, &match); + if (rv != 0) { + return rv; + } + } + + /* Converted to follower as a result of seeing a higher term. */ + if (r->state != RAFT_CANDIDATE) { + tracef("no longer candidate -> ignore"); + return 0; + } + + if (match < 0) { + /* If the term in the result is older than ours, this is an old + * message we should ignore, because the node who voted for us + * would have obtained our term. This happens if the network is + * pretty choppy. */ + tracef("local term is higher -> ignore"); + return 0; + } + + /* Avoid counting pre-vote votes as regular votes. */ + if (result->version > 1 && result->pre_vote && + !r->candidate_state.in_pre_vote) { + tracef("receive stale pre-vote response -> ignore"); + return 0; + } + + /* This can happen when a candidate wins a pre-vote, bumps its term, + * sends real RequestVote RPCs, crashes, comes online, starts a pre-vote + * and then receives the response to the RequestVote RPC it sent + * out before crashing. */ + if (result->version > 1 && !result->pre_vote && + r->candidate_state.in_pre_vote) { + tracef("receive vote response during pre-vote -> ignore"); + return 0; + } + + /* If we're in the pre-vote phase, check that the peer's is at most one + * term ahead (possibly stepping down). If we're the actual voting + * phase, we expect our term must to be the same as the response term + * (otherwise we would have either ignored the result bumped our term). + */ + if (r->candidate_state.in_pre_vote) { + if (match > 0) { + if (result->term > r->current_term + 1) { + assert(!result->vote_granted); + rv = recvBumpCurrentTerm(r, result->term); + return rv; + } + } + } else { + assert(result->term == r->current_term); + } + + /* If the vote was granted and we reached quorum, convert to leader. + * + * From Figure 3.1: + * + * If votes received from majority of severs: become leader. + * + * From state diagram in Figure 3.3: + * + * [candidate]: receives votes from majority of servers -> [leader] + * + * From Section 3.4: + * + * A candidate wins an election if it receives votes from a majority + * of the servers in the full cluster for the same term. Each server + * will vote for at most one candidate in a given term, on a + * firstcome-first-served basis [...]. Once a candidate wins an + * election, it becomes leader. + */ + if (result->vote_granted) { + if (electionTally(r, votes_index)) { + if (r->candidate_state.in_pre_vote) { + tracef( + "votes quorum reached -> pre-vote " + "successful"); + r->candidate_state.in_pre_vote = false; + rv = electionStart(r); + if (rv != 0) { + return rv; + } + } else { + tracef( + "votes quorum reached -> convert to " + "leader"); + rv = convertToLeader(r); + if (rv != 0) { + return rv; + } + /* Send initial heartbeat. */ + replicationHeartbeat(r); + } + } else { + tracef("votes quorum not reached"); + } + } else { + tracef("vote was not granted"); + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_request_vote_result.h b/src/raft/recv_request_vote_result.h new file mode 100644 index 000000000..344f3ef53 --- /dev/null +++ b/src/raft/recv_request_vote_result.h @@ -0,0 +1,14 @@ +/* Receive a RequestVote result. */ + +#ifndef RECV_REQUEST_VOTE_RESULT_H_ +#define RECV_REQUEST_VOTE_RESULT_H_ + +#include "../raft.h" + +/* Process a RequestVote RPC result from the given server. 
*/ +int recvRequestVoteResult(struct raft *r, + raft_id id, + const char *address, + const struct raft_request_vote_result *result); + +#endif /* RAFT_RECV_REQUEST_VOTE_RESULT_H_ */ diff --git a/src/raft/recv_timeout_now.c b/src/raft/recv_timeout_now.c new file mode 100644 index 000000000..c503c7600 --- /dev/null +++ b/src/raft/recv_timeout_now.c @@ -0,0 +1,81 @@ +#include "recv_timeout_now.h" + +#include "../tracing.h" +#include "assert.h" +#include "configuration.h" +#include "convert.h" +#include "log.h" +#include "recv.h" + +int recvTimeoutNow(struct raft *r, + const raft_id id, + const char *address, + const struct raft_timeout_now *args) +{ + const struct raft_server *local_server; + raft_index local_last_index; + raft_term local_last_term; + int match; + int rv; + + assert(r != NULL); + assert(id > 0); + assert(args != NULL); + + (void)address; + + tracef( + "self:%llu from:%llu@%s last_log_index:%llu last_log_term:%llu " + "term:%llu", + r->id, id, address, args->last_log_index, args->last_log_term, + args->term); + /* Ignore the request if we are not voters. */ + local_server = configurationGet(&r->configuration, r->id); + if (local_server == NULL || local_server->role != RAFT_VOTER) { + tracef("non-voter"); + return 0; + } + + /* Ignore the request if we are not follower, or we have different + * leader. */ + if (r->state != RAFT_FOLLOWER || + r->follower_state.current_leader.id != id) { + tracef("Ignore - r->state:%d current_leader.id:%llu", r->state, + r->follower_state.current_leader.id); + return 0; + } + + /* Possibly update our term. Ignore the request if it turns out we have + * a higher term. */ + rv = recvEnsureMatchingTerms(r, args->term, &match); + if (rv != 0) { + return rv; + } + if (match < 0) { + return 0; + } + + /* Ignore the request if we our log is not up-to-date. */ + local_last_index = logLastIndex(r->log); + local_last_term = logLastTerm(r->log); + if (local_last_index != args->last_log_index || + local_last_term != args->last_log_term) { + return 0; + } + + /* Finally, ignore the request if we're working on persisting some + * entries. */ + if (r->follower_state.append_in_flight_count > 0) { + return 0; + } + + /* Convert to candidate and start a new election. */ + rv = convertToCandidate(r, true /* disrupt leader */); + if (rv != 0) { + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/recv_timeout_now.h b/src/raft/recv_timeout_now.h new file mode 100644 index 000000000..5678c290c --- /dev/null +++ b/src/raft/recv_timeout_now.h @@ -0,0 +1,14 @@ +/* Receive a TimeoutNow message. */ + +#ifndef RECV_TIMEOUT_NOW_H_ +#define RECV_TIMEOUT_NOW_H_ + +#include "../raft.h" + +/* Process a TimeoutNow RPC from the given server. */ +int recvTimeoutNow(struct raft *r, + raft_id id, + const char *address, + const struct raft_timeout_now *args); + +#endif /* RECV_TIMEOUT_NOW_H_ */ diff --git a/src/raft/replication.c b/src/raft/replication.c new file mode 100644 index 000000000..8310feb8b --- /dev/null +++ b/src/raft/replication.c @@ -0,0 +1,1837 @@ +#include + +#include "assert.h" +#include "configuration.h" +#include "convert.h" +#include "entry.h" +#ifdef __GLIBC__ +#include "error.h" +#endif +#include "../tracing.h" +#include "err.h" +#include "flags.h" +#include "heap.h" +#include "lifecycle.h" +#include "log.h" +#include "membership.h" +#include "progress.h" +#include "queue.h" +#include "replication.h" +#include "request.h" +#include "snapshot.h" + +#ifndef max +#define max(a, b) ((a) < (b) ? 
(b) : (a)) +#endif + +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + +/* Context of a RAFT_IO_APPEND_ENTRIES request that was submitted with + * raft_io_>send(). */ +struct sendAppendEntries +{ + struct raft *raft; /* Instance sending the entries. */ + struct raft_io_send send; /* Underlying I/O send request. */ + raft_index index; /* Index of the first entry in the request. */ + struct raft_entry *entries; /* Entries referenced in the request. */ + unsigned n; /* Length of the entries array. */ + raft_id server_id; /* Destination server. */ +}; + +/* Callback invoked after request to send an AppendEntries RPC has completed. */ +static void sendAppendEntriesCb(struct raft_io_send *send, const int status) +{ + struct sendAppendEntries *req = send->data; + struct raft *r = req->raft; + unsigned i = configurationIndexOf(&r->configuration, req->server_id); + + if (r->state == RAFT_LEADER && i < r->configuration.n) { + if (status != 0) { + tracef( + "failed to send append entries to server %llu: %s", + req->server_id, raft_strerror(status)); + /* Go back to probe mode. */ + progressToProbe(r, i); + } + } + + /* Tell the log that we're done referencing these entries. */ + logRelease(r->log, req->index, req->entries, req->n); + raft_free(req); +} + +/* Send an AppendEntries message to the i'th server, including all log entries + * from the given point onwards. */ +static int sendAppendEntries(struct raft *r, + const unsigned i, + const raft_index prev_index, + const raft_term prev_term) +{ + struct raft_server *server = &r->configuration.servers[i]; + struct raft_message message; + struct raft_append_entries *args = &message.append_entries; + struct sendAppendEntries *req; + raft_index next_index = prev_index + 1; + int rv; + + args->term = r->current_term; + args->prev_log_index = prev_index; + args->prev_log_term = prev_term; + + /* TODO: implement a limit to the total size of the entries being sent + */ + rv = logAcquire(r->log, next_index, &args->entries, &args->n_entries); + if (rv != 0) { + goto err; + } + + /* From Section 3.5: + * + * The leader keeps track of the highest index it knows to be + * committed, and it includes that index in future AppendEntries RPCs + * (including heartbeats) so that the other servers eventually find out. + * Once a follower learns that a log entry is committed, it applies the + * entry to its local state machine (in log order) + */ + args->leader_commit = r->commit_index; + + tracef( + "send %u entries starting at %llu to server %llu (last index %llu)", + args->n_entries, args->prev_log_index, server->id, + logLastIndex(r->log)); + + message.type = RAFT_IO_APPEND_ENTRIES; + message.server_id = server->id; + message.server_address = server->address; + + req = raft_malloc(sizeof *req); + if (req == NULL) { + rv = RAFT_NOMEM; + goto err_after_entries_acquired; + } + req->raft = r; + req->index = args->prev_log_index + 1; + req->entries = args->entries; + req->n = args->n_entries; + req->server_id = server->id; + + req->send.data = req; + rv = r->io->send(r->io, &req->send, &message, sendAppendEntriesCb); + if (rv != 0) { + goto err_after_req_alloc; + } + + if (progressState(r, i) == PROGRESS__PIPELINE) { + /* Optimistically update progress. 
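+		 * In pipeline mode we keep streaming without waiting for
+		 * results: e.g. (hypothetical numbers) after sending
+		 * entries 8..10 the next index jumps straight to 11, so
+		 * the following batch can be prepared immediately.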
*/ + progressOptimisticNextIndex(r, i, req->index + req->n); + } + + progressUpdateLastSend(r, i); + return 0; + +err_after_req_alloc: + raft_free(req); +err_after_entries_acquired: + logRelease(r->log, next_index, args->entries, args->n_entries); +err: + assert(rv != 0); + return rv; +} + +/* Context of a RAFT_IO_INSTALL_SNAPSHOT request that was submitted with + * raft_io_>send(). */ +struct sendInstallSnapshot +{ + struct raft *raft; /* Instance sending the snapshot. */ + struct raft_io_snapshot_get get; /* Snapshot get request. */ + struct raft_io_send send; /* Underlying I/O send request. */ + struct raft_snapshot *snapshot; /* Snapshot to send. */ + raft_id server_id; /* Destination server. */ +}; + +static void sendInstallSnapshotCb(struct raft_io_send *send, int status) +{ + struct sendInstallSnapshot *req = send->data; + struct raft *r = req->raft; + const struct raft_server *server; + + server = configurationGet(&r->configuration, req->server_id); + + if (status != 0) { + tracef("send install snapshot: %s", raft_strerror(status)); + if (r->state == RAFT_LEADER && server != NULL) { + unsigned i; + i = configurationIndexOf(&r->configuration, + req->server_id); + progressAbortSnapshot(r, i); + } + } + + snapshotClose(req->snapshot); + raft_free(req->snapshot); + raft_free(req); +} + +static void sendSnapshotGetCb(struct raft_io_snapshot_get *get, + struct raft_snapshot *snapshot, + int status) +{ + struct sendInstallSnapshot *req = get->data; + struct raft *r = req->raft; + struct raft_message message; + struct raft_install_snapshot *args = &message.install_snapshot; + const struct raft_server *server = NULL; + bool progress_state_is_snapshot = false; + unsigned i = 0; + int rv; + + if (status != 0) { + tracef("get snapshot %s", raft_strerror(status)); + goto abort; + } + if (r->state != RAFT_LEADER) { + goto abort_with_snapshot; + } + + server = configurationGet(&r->configuration, req->server_id); + + if (server == NULL) { + /* Probably the server was removed in the meantime. */ + goto abort_with_snapshot; + } + + i = configurationIndexOf(&r->configuration, req->server_id); + progress_state_is_snapshot = progressState(r, i) == PROGRESS__SNAPSHOT; + + if (!progress_state_is_snapshot) { + /* Something happened in the meantime. 
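+		 * One way this can happen (an assumption, for
+		 * illustration): an earlier send failed and
+		 * progressAbortSnapshot() already moved this follower back
+		 * to probe mode, making this snapshot stale.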
*/ + goto abort_with_snapshot; + } + + assert(snapshot->n_bufs == 1); + + message.type = RAFT_IO_INSTALL_SNAPSHOT; + message.server_id = server->id; + message.server_address = server->address; + + args->term = r->current_term; + args->last_index = snapshot->index; + args->last_term = snapshot->term; + args->conf_index = snapshot->configuration_index; + args->conf = snapshot->configuration; + args->data = snapshot->bufs[0]; + + req->snapshot = snapshot; + req->send.data = req; + + tracef("sending snapshot with last index %llu to %llu", snapshot->index, + server->id); + + rv = r->io->send(r->io, &req->send, &message, sendInstallSnapshotCb); + if (rv != 0) { + goto abort_with_snapshot; + } + + goto out; + +abort_with_snapshot: + snapshotClose(snapshot); + raft_free(snapshot); +abort: + if (r->state == RAFT_LEADER && server != NULL && + progress_state_is_snapshot) { + progressAbortSnapshot(r, i); + } + raft_free(req); +out: + return; +} + +/* Send the latest snapshot to the i'th server */ +static int sendSnapshot(struct raft *r, const unsigned i) +{ + struct raft_server *server = &r->configuration.servers[i]; + struct sendInstallSnapshot *request; + int rv; + + progressToSnapshot(r, i); + + request = raft_malloc(sizeof *request); + if (request == NULL) { + rv = RAFT_NOMEM; + goto err; + } + request->raft = r; + request->server_id = server->id; + request->get.data = request; + + /* TODO: make sure that the I/O implementation really returns the latest + * snapshot *at this time* and not any snapshot that might be stored at + * a later point. Otherwise the progress snapshot_index would be wrong. + */ + rv = r->io->snapshot_get(r->io, &request->get, sendSnapshotGetCb); + if (rv != 0) { + goto err_after_req_alloc; + } + + progressUpdateSnapshotLastSend(r, i); + return 0; + +err_after_req_alloc: + raft_free(request); +err: + progressAbortSnapshot(r, i); + assert(rv != 0); + return rv; +} + +int replicationProgress(struct raft *r, unsigned i) +{ + struct raft_server *server = &r->configuration.servers[i]; + bool progress_state_is_snapshot = + progressState(r, i) == PROGRESS__SNAPSHOT; + raft_index snapshot_index = logSnapshotIndex(r->log); + raft_index next_index = progressNextIndex(r, i); + raft_index prev_index; + raft_term prev_term; + + assert(r->state == RAFT_LEADER); + assert(server->id != r->id); + assert(next_index >= 1); + + if (!progressShouldReplicate(r, i)) { + return 0; + } + + /* From Section 3.5: + * + * When sending an AppendEntries RPC, the leader includes the index + * and term of the entry in its log that immediately precedes the new + * entries. If the follower does not find an entry in its log with the + * same index and term, then it refuses the new entries. The + * consistency check acts as an induction step: the initial empty state + * of the logs satisfies the Log Matching Property, and the consistency + * check preserves the Log Matching Property whenever logs are extended. + * As a result, whenever AppendEntries returns successfully, the leader + * knows that the follower's log is identical to its own log up through + * the new entries (Log Matching Property in Figure 3.2). + */ + if (next_index == 1) { + /* We're including the very first entry, so prevIndex and + * prevTerm are null. If the first entry is not available + * anymore, send the last snapshot if we're not already sending + * one. 
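+		 * Example (hypothetical numbers): after a snapshot at
+		 * index 1000 the log only holds entries from 1001
+		 * onwards, so a follower that still needs entry 1 can
+		 * only be brought up to date by shipping the snapshot.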
*/ + if (snapshot_index > 0 && !progress_state_is_snapshot) { + raft_index last_index = logLastIndex(r->log); + assert(last_index > 0); /* The log can't be empty */ + goto send_snapshot; + } + prev_index = 0; + prev_term = 0; + } else { + /* Set prevIndex and prevTerm to the index and term of the entry + * at next_index - 1. */ + prev_index = next_index - 1; + prev_term = logTermOf(r->log, prev_index); + /* If the entry is not anymore in our log, send the last + * snapshot if we're not doing so already. */ + if (prev_term == 0 && !progress_state_is_snapshot) { + assert(prev_index < snapshot_index); + tracef("missing entry at index %lld -> send snapshot", + prev_index); + goto send_snapshot; + } + } + + /* Send empty AppendEntries RPC when installing a snaphot */ + if (progress_state_is_snapshot) { + prev_index = logLastIndex(r->log); + prev_term = logLastTerm(r->log); + } + + return sendAppendEntries(r, i, prev_index, prev_term); + +send_snapshot: + if (progressGetRecentRecv(r, i)) { + /* Only send a snapshot when we have heard from the server */ + return sendSnapshot(r, i); + } else { + /* Send empty AppendEntries RPC when we haven't heard from the + * server */ + prev_index = logLastIndex(r->log); + prev_term = logLastTerm(r->log); + return sendAppendEntries(r, i, prev_index, prev_term); + } +} + +/* Possibly trigger I/O requests for newly appended log entries or heartbeats. + * + * This function loops through all followers and triggers replication on them. + * + * It must be called only by leaders. */ +static int triggerAll(struct raft *r) +{ + unsigned i; + int rv; + + assert(r->state == RAFT_LEADER); + + /* Trigger replication for servers we didn't hear from recently. */ + for (i = 0; i < r->configuration.n; i++) { + struct raft_server *server = &r->configuration.servers[i]; + if (server->id == r->id) { + continue; + } + /* Skip spare servers, unless they're being promoted. */ + if (server->role == RAFT_SPARE && + server->id != r->leader_state.promotee_id) { + continue; + } + rv = replicationProgress(r, i); + if (rv != 0 && rv != RAFT_NOCONNECTION) { + /* This is not a critical failure, let's just log it. */ + tracef( + "failed to send append entries to server %llu: %s " + "(%d)", + server->id, raft_strerror(rv), rv); + } + } + + return 0; +} + +int replicationHeartbeat(struct raft *r) +{ + return triggerAll(r); +} + +/* Context for a write log entries request that was submitted by a leader. */ +struct appendLeader +{ + struct raft *raft; /* Instance that has submitted the request */ + raft_index index; /* Index of the first entry in the request. */ + struct raft_entry *entries; /* Entries referenced in the request. */ + unsigned n; /* Length of the entries array. */ + struct raft_io_append req; +}; + +/* Called after a successful append entries I/O request to update the index of + * the last entry stored on disk. Return how many new entries that are still + * present in our in-memory log were stored. */ +static size_t updateLastStored(struct raft *r, + raft_index first_index, + struct raft_entry *entries, + size_t n_entries) +{ + size_t i; + + /* Check which of these entries is still in our in-memory log */ + for (i = 0; i < n_entries; i++) { + struct raft_entry *entry = &entries[i]; + raft_index index = first_index + i; + raft_term local_term = logTermOf(r->log, index); + + /* If we have no entry at this index, or if the entry we have + * now has a different term, it means that this entry got + * truncated, so let's stop here. 
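+		 * Example (hypothetical): we submitted entry 9 with term 2
+		 * for writing, but a new leader has since truncated our
+		 * in-memory log and index 9 now holds an entry with term
+		 * 3, so the completed write must not count towards
+		 * last_stored.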
*/ + if (local_term == 0 || + (local_term > 0 && local_term != entry->term)) { + break; + } + + /* If we do have an entry at this index, its term must match the + * one of the entry we wrote on disk. */ + assert(local_term != 0 && local_term == entry->term); + } + + r->last_stored += i; + return i; +} + +/* Get the request matching the given @index and @type, if any. + * The type check is skipped when @type == -1. */ +static struct request *getRequest(struct raft *r, + const raft_index index, + int type) +{ + queue *head; + struct request *req; + + if (r->state != RAFT_LEADER) { + return NULL; + } + QUEUE_FOREACH(head, &r->leader_state.requests) + { + req = QUEUE_DATA(head, struct request, queue); + if (req->index == index) { + if (type != -1) { + assert(req->type == type); + } + lifecycleRequestEnd(r, req); + return req; + } + } + return NULL; +} + +/* Invoked once a disk write request for new entries has been completed. */ +static void appendLeaderCb(struct raft_io_append *append, int status) +{ + struct appendLeader *request = append->data; + struct raft *r = request->raft; + size_t server_index; + raft_index index; + int rv; + + tracef("leader: written %u entries starting at %lld: status %d", + request->n, request->index, status); + + /* In case of a failed disk write, if we were the leader creating these + * entries in the first place, truncate our log too (since we have + * appended these entries to it) and fire the request callbacks. + * + * Afterward, convert immediately to follower state, giving the cluster + * a chance to elect another leader that doesn't have a full disk (or + * whatever caused our write error). */ + if (status != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); + for (unsigned i = 0; i < request->n; i++) { + const struct request *req = + getRequest(r, request->index + i, -1); + if (!req) { + tracef("no request found at index %llu", + request->index + i); + continue; + } + switch (req->type) { + case RAFT_COMMAND: { + struct raft_apply *apply = + (struct raft_apply *)req; + if (apply->cb) { + apply->cb(apply, status, NULL); + } + break; + } + case RAFT_BARRIER: { + struct raft_barrier *barrier = + (struct raft_barrier *)req; + if (barrier->cb) { + barrier->cb(barrier, status); + } + break; + } + case RAFT_CHANGE: { + struct raft_change *change = + (struct raft_change *)req; + if (change->cb) { + change->cb(change, status); + } + break; + } + default: + tracef( + "unknown request type, shutdown."); + assert(false); + break; + } + } + goto out; + } + + updateLastStored(r, request->index, request->entries, request->n); + + /* If we are not leader anymore, just discard the result. */ + if (r->state != RAFT_LEADER) { + tracef("local server is not leader -> ignore write log result"); + goto out; + } + + /* Only update the next index if we are part of the current + * configuration. The only case where this is not true is when we were + * asked to remove ourselves from the cluster. + * + * From Section 4.2.2: + * + * there will be a period of time (while it is committing Cnew) when a + * leader can manage a cluster that does not include itself; it + * replicates log entries but does not count itself in majorities. + */ + server_index = configurationIndexOf(&r->configuration, r->id); + if (server_index < r->configuration.n) { + r->leader_state.progress[server_index].match_index = + r->last_stored; + } + + /* Check if we can commit some new entries. 
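+	 * An entry of the current term becomes committed once a majority of
+	 * voters (counting ourselves, whose match index was bumped just
+	 * above) have it stored.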
+	 */
+	replicationQuorum(r, r->last_stored);
+
+	rv = replicationApply(r);
+	if (rv != 0) {
+		/* TODO: just log the error? */
+	}
+
+out:
+	/* Tell the log that we're done referencing these entries. */
+	logRelease(r->log, request->index, request->entries, request->n);
+	index = request->index;
+	raft_free(request);
+	if (status != 0) {
+		if (index <= logLastIndex(r->log)) {
+			logTruncate(r->log, index);
+		}
+		if (r->state == RAFT_LEADER) {
+			convertToFollower(r);
+		}
+	}
+}
+
+/* Submit a disk write for all entries from the given index onward. */
+static int appendLeader(struct raft *r, raft_index index)
+{
+	struct raft_entry *entries = NULL;
+	unsigned n;
+	struct appendLeader *request;
+	int rv;
+
+	assert(r->state == RAFT_LEADER);
+	assert(index > 0);
+	assert(index > r->last_stored);
+
+	/* Acquire all the entries from the given index onwards. */
+	rv = logAcquire(r->log, index, &entries, &n);
+	if (rv != 0) {
+		goto err;
+	}
+
+	/* We expect this function to be called only when there are actually
+	 * some entries to write. */
+	if (n == 0) {
+		assert(false);
+		tracef("No log entries found at index %llu", index);
+		ErrMsgPrintf(r->errmsg, "No log entries found at index %llu",
+			     index);
+		rv = RAFT_SHUTDOWN;
+		goto err_after_entries_acquired;
+	}
+
+	/* Allocate a new request. */
+	request = raft_malloc(sizeof *request);
+	if (request == NULL) {
+		rv = RAFT_NOMEM;
+		goto err_after_entries_acquired;
+	}
+
+	request->raft = r;
+	request->index = index;
+	request->entries = entries;
+	request->n = n;
+	request->req.data = request;
+
+	rv = r->io->append(r->io, &request->req, entries, n, appendLeaderCb);
+	if (rv != 0) {
+		ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
+		goto err_after_request_alloc;
+	}
+
+	return 0;
+
+err_after_request_alloc:
+	raft_free(request);
+err_after_entries_acquired:
+	logRelease(r->log, index, entries, n);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+int replicationTrigger(struct raft *r, raft_index index)
+{
+	int rv;
+
+	rv = appendLeader(r, index);
+	if (rv != 0) {
+		return rv;
+	}
+
+	return triggerAll(r);
+}
+
+/* Helper to be invoked after a promotion of a non-voting server has been
+ * requested via @raft_assign and that server has caught up with our log.
+ *
+ * This function changes the local configuration, marking the server being
+ * promoted as actually voting, appends a RAFT_CHANGE entry with the new
+ * configuration to the local log and triggers its replication. */
+static int triggerActualPromotion(struct raft *r)
+{
+	raft_index index;
+	raft_term term = r->current_term;
+	size_t server_index;
+	struct raft_server *server;
+	int old_role;
+	int rv;
+
+	assert(r->state == RAFT_LEADER);
+	assert(r->leader_state.promotee_id != 0);
+
+	server_index = configurationIndexOf(&r->configuration,
+					    r->leader_state.promotee_id);
+	assert(server_index < r->configuration.n);
+
+	server = &r->configuration.servers[server_index];
+
+	assert(server->role != RAFT_VOTER);
+
+	/* Update our current configuration. */
+	old_role = server->role;
+	server->role = RAFT_VOTER;
+
+	/* Index of the entry being appended. */
+	index = logLastIndex(r->log) + 1;
+
+	/* Encode the new configuration and append it to the log. */
+	rv = logAppendConfiguration(r->log, term, &r->configuration);
+	if (rv != 0) {
+		goto err;
+	}
+
+	/* Start writing the new log entry to disk and send it to the
+	 * followers.
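+	 * replicationTrigger() below does both: it submits the local disk
+	 * write (via appendLeader()) and kicks off AppendEntries messages to
+	 * the followers (via triggerAll()).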
+	 */
+	rv = replicationTrigger(r, index);
+	if (rv != 0) {
+		goto err_after_log_append;
+	}
+
+	r->leader_state.promotee_id = 0;
+	r->configuration_uncommitted_index = logLastIndex(r->log);
+
+	return 0;
+
+err_after_log_append:
+	logTruncate(r->log, index);
+
+err:
+	server->role = old_role;
+
+	assert(rv != 0);
+	return rv;
+}
+
+int replicationUpdate(struct raft *r,
+		      const struct raft_server *server,
+		      const struct raft_append_entries_result *result)
+{
+	bool is_being_promoted;
+	raft_index last_index;
+	unsigned i;
+	int rv;
+
+	i = configurationIndexOf(&r->configuration, server->id);
+
+	assert(r->state == RAFT_LEADER);
+	assert(i < r->configuration.n);
+
+	progressMarkRecentRecv(r, i);
+
+	progressSetFeatures(r, i, result->features);
+
+	/* If the RPC failed because of a log mismatch, retry.
+	 *
+	 * From Figure 3.1:
+	 *
+	 *   [Rules for servers] Leaders:
+	 *
+	 *   - If AppendEntries fails because of log inconsistency:
+	 *     decrement nextIndex and retry.
+	 */
+	if (result->rejected > 0) {
+		bool retry;
+		retry = progressMaybeDecrement(r, i, result->rejected,
+					       result->last_log_index);
+		if (retry) {
+			/* Retry, ignoring errors. */
+			tracef("log mismatch -> send old entries to %llu",
+			       server->id);
+			replicationProgress(r, i);
+		}
+		return 0;
+	}
+
+	/* In case of success the remote server is expected to send us back the
+	 * value of prevLogIndex + len(entriesToAppend). If it has a longer
+	 * log, it might be a leftover from previous terms. */
+	last_index = result->last_log_index;
+	if (last_index > logLastIndex(r->log)) {
+		last_index = logLastIndex(r->log);
+	}
+
+	/* If the RPC succeeded, update our counters for this server.
+	 *
+	 * From Figure 3.1:
+	 *
+	 *   [Rules for servers] Leaders:
+	 *
+	 *   If successful: update nextIndex and matchIndex for follower.
+	 */
+	if (!progressMaybeUpdate(r, i, last_index)) {
+		return 0;
+	}
+
+	switch (progressState(r, i)) {
+		case PROGRESS__SNAPSHOT:
+			/* If a snapshot has been installed, transition back to
+			 * probe */
+			if (progressSnapshotDone(r, i)) {
+				progressToProbe(r, i);
+			}
+			break;
+		case PROGRESS__PROBE:
+			/* Transition to pipeline */
+			progressToPipeline(r, i);
+	}
+
+	/* If the server is currently being promoted and is catching up with
+	 * our log, update the information about the current catch-up round,
+	 * and possibly proceed with the promotion. */
+	is_being_promoted = r->leader_state.promotee_id != 0 &&
+			    r->leader_state.promotee_id == server->id;
+	if (is_being_promoted) {
+		bool is_up_to_date = membershipUpdateCatchUpRound(r);
+		if (is_up_to_date) {
+			rv = triggerActualPromotion(r);
+			if (rv != 0) {
+				return rv;
+			}
+		}
+	}
+
+	/* Check if we can commit some new entries. */
+	replicationQuorum(r, last_index);
+
+	rv = replicationApply(r);
+	if (rv != 0) {
+		/* TODO: just log the error? */
+	}
+
+	/* Abort here if we have been removed and are no longer leader. */
+	if (r->state != RAFT_LEADER) {
+		goto out;
+	}
+
+	/* Look up the server index again, since the server might have been
+	 * removed from the configuration. */
+	i = configurationIndexOf(&r->configuration, server->id);
+
+	if (i < r->configuration.n) {
+		/* If we are transferring leadership to this follower, check
+		 * if its log is now up-to-date and, if so, send it a
+		 * TimeoutNow RPC (unless we already did).
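+		 * (The send.data field of the transfer request doubles as
+		 * the "did we already send it" marker checked below.)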
*/ + if (r->transfer != NULL && r->transfer->id == server->id) { + if (progressPersistedIsUpToDate(r, i) && + r->transfer->send.data == NULL) { + rv = membershipLeadershipTransferStart(r); + if (rv != 0) { + membershipLeadershipTransferClose(r); + } + } + } + /* If this follower is in pipeline mode, send it more entries. + */ + if (progressState(r, i) == PROGRESS__PIPELINE) { + replicationProgress(r, i); + } + } + +out: + return 0; +} + +static void sendAppendEntriesResultCb(struct raft_io_send *req, int status) +{ + (void)status; + RaftHeapFree(req); +} + +static void sendAppendEntriesResult( + struct raft *r, + const struct raft_append_entries_result *result) +{ + struct raft_message message; + struct raft_io_send *req; + int rv; + + assert(r->state == RAFT_FOLLOWER); + message.type = RAFT_IO_APPEND_ENTRIES_RESULT; + message.server_id = r->follower_state.current_leader.id; + message.server_address = r->follower_state.current_leader.address; + message.append_entries_result = *result; + + req = raft_malloc(sizeof *req); + if (req == NULL) { + return; + } + req->data = r; + + rv = r->io->send(r->io, req, &message, sendAppendEntriesResultCb); + if (rv != 0) { + raft_free(req); + } +} + +/* Context for a write log entries request that was submitted by a follower. */ +struct appendFollower +{ + struct raft *raft; /* Instance that has submitted the request */ + raft_index index; /* Index of the first entry in the request. */ + struct raft_append_entries args; + struct raft_io_append req; +}; + +static void appendFollowerCb(struct raft_io_append *req, int status) +{ + struct appendFollower *request = req->data; + struct raft *r = request->raft; + struct raft_append_entries *args = &request->args; + struct raft_append_entries_result result; + size_t i; + size_t j; + int rv; + + tracef("I/O completed on follower: status %d", status); + + assert(args->entries != NULL); + assert(args->n_entries > 0); + + assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE); + if (r->state == RAFT_UNAVAILABLE) { + goto out; + } + assert(r->follower_state.append_in_flight_count > 0); + r->follower_state.append_in_flight_count -= 1; + + result.term = r->current_term; + result.version = RAFT_APPEND_ENTRIES_RESULT_VERSION; + result.features = RAFT_DEFAULT_FEATURE_FLAGS; + if (status != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); + result.rejected = args->prev_log_index + 1; + goto respond; + } + + /* We received an InstallSnapshot RPC while these entries were being + * persisted to disk */ + if (replicationInstallSnapshotBusy(r)) { + goto out; + } + + i = updateLastStored(r, request->index, args->entries, args->n_entries); + + /* If none of the entries that we persisted is present anymore in our + * in-memory log, there's nothing to report or to do. We just discard + * them. */ + if (i == 0) { + goto out; + } + + /* Possibly apply configuration changes as uncommitted. */ + for (j = 0; j < i; j++) { + struct raft_entry *entry = &args->entries[j]; + raft_index index = request->index + j; + raft_term local_term = logTermOf(r->log, index); + + assert(local_term != 0 && local_term == entry->term); + + if (entry->type == RAFT_CHANGE) { + rv = membershipUncommittedChange(r, index, entry); + if (rv != 0) { + goto out; + } + } + } + + /* From Figure 3.1: + * + * AppendEntries RPC: Receiver implementation: If leaderCommit > + * commitIndex, set commitIndex = min(leaderCommit, index of last new + * entry). 
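+	 *
+	 * For instance (illustrative numbers): if the leader reports
+	 * leaderCommit = 7 but we have only stored entries up to index 5,
+	 * commitIndex becomes min(7, 5) = 5; the remaining entries are
+	 * committed once they reach our disk.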
+ */ + if (args->leader_commit > r->commit_index && + r->last_stored >= r->commit_index) { + r->commit_index = min(args->leader_commit, r->last_stored); + rv = replicationApply(r); + if (rv != 0) { + goto out; + } + } + + /* If our term number has changed since receiving these entries, + * our current_leader may have changed as well, so don't send a response + * to that server. */ + if (r->current_term != args->term) { + tracef( + "new role or term since receiving entries -> don't " + "respond"); + goto out; + } + + result.rejected = 0; + +respond: + result.last_log_index = r->last_stored; + sendAppendEntriesResult(r, &result); + +out: + logRelease(r->log, request->index, request->args.entries, + request->args.n_entries); + + /* If the write failed, we need to truncate the log. */ + if (status != 0) { + if (request->index <= logLastIndex(r->log)) { + logTruncate(r->log, request->index); + } + } + + raft_free(request); +} + +/* Check the log matching property against an incoming AppendEntries request. + * + * From Figure 3.1: + * + * [AppendEntries RPC] Receiver implementation: + * + * 2. Reply false if log doesn't contain an entry at prevLogIndex whose + * term matches prevLogTerm. + * + * Return 0 if the check passed. + * + * Return 1 if the check did not pass and the request needs to be rejected. + * + * Return -1 if there's a conflict and we need to shutdown. */ +static int checkLogMatchingProperty(struct raft *r, + const struct raft_append_entries *args) +{ + raft_term local_prev_term; + + /* If this is the very first entry, there's nothing to check. */ + if (args->prev_log_index == 0) { + return 0; + } + + local_prev_term = logTermOf(r->log, args->prev_log_index); + if (local_prev_term == 0) { + tracef("no entry at index %llu -> reject", + args->prev_log_index); + return 1; + } + + if (local_prev_term != args->prev_log_term) { + if (args->prev_log_index <= r->commit_index) { + /* Should never happen; something is seriously wrong! */ + tracef( + "conflicting terms %llu and %llu for entry %llu " + "(commit " + "index %llu) -> shutdown", + local_prev_term, args->prev_log_term, + args->prev_log_index, r->commit_index); + return -1; + } + tracef("previous term mismatch -> reject"); + return 1; + } + + return 0; +} + +/* Delete from our log all entries that conflict with the ones in the given + * AppendEntries request. + * + * From Figure 3.1: + * + * [AppendEntries RPC] Receiver implementation: + * + * 3. If an existing entry conflicts with a new one (same index but + * different terms), delete the existing entry and all that follow it. + * + * The i output parameter will be set to the array index of the first new log + * entry that we don't have yet in our log, among the ones included in the given + * AppendEntries request. */ +static int deleteConflictingEntries(struct raft *r, + const struct raft_append_entries *args, + size_t *i) +{ + size_t j; + int rv; + + for (j = 0; j < args->n_entries; j++) { + struct raft_entry *entry = &args->entries[j]; + raft_index entry_index = args->prev_log_index + 1 + j; + raft_term local_term = logTermOf(r->log, entry_index); + + if (local_term > 0 && local_term != entry->term) { + if (entry_index <= r->commit_index) { + /* Should never happen; something is seriously + * wrong! */ + tracef( + "new index conflicts with committed entry " + "-> shutdown"); + return RAFT_SHUTDOWN; + } + + tracef("log mismatch -> truncate (%llu)", entry_index); + + /* Possibly discard uncommitted configuration changes. 
+			 */
+			if (r->configuration_uncommitted_index >=
+			    entry_index) {
+				rv = membershipRollback(r);
+				if (rv != 0) {
+					return rv;
+				}
+			}
+
+			/* Delete all entries from this index on because they
+			 * don't match. */
+			rv = r->io->truncate(r->io, entry_index);
+			if (rv != 0) {
+				return rv;
+			}
+			logTruncate(r->log, entry_index);
+
+			/* Drop information about previously stored entries
+			 * that have just been discarded. */
+			if (r->last_stored >= entry_index) {
+				r->last_stored = entry_index - 1;
+			}
+
+			/* We want to append all entries from here on,
+			 * replacing anything that we had before. */
+			break;
+		} else if (local_term == 0) {
+			/* We don't have an entry at this index, so we want to
+			 * append this new one and all the subsequent ones. */
+			break;
+		}
+	}
+
+	*i = j;
+
+	return 0;
+}
+
+int replicationAppend(struct raft *r,
+		      const struct raft_append_entries *args,
+		      raft_index *rejected,
+		      bool *async)
+{
+	struct appendFollower *request;
+	int match;
+	size_t n;
+	size_t i;
+	size_t j;
+	bool reinstated;
+	int rv;
+
+	assert(r != NULL);
+	assert(args != NULL);
+	assert(rejected != NULL);
+	assert(async != NULL);
+
+	assert(r->state == RAFT_FOLLOWER);
+
+	*rejected = args->prev_log_index;
+	*async = false;
+
+	/* Check the log matching property. */
+	match = checkLogMatchingProperty(r, args);
+	if (match != 0) {
+		assert(match == 1 || match == -1);
+		return match == 1 ? 0 : RAFT_SHUTDOWN;
+	}
+
+	/* Delete conflicting entries. */
+	rv = deleteConflictingEntries(r, args, &i);
+	if (rv != 0) {
+		return rv;
+	}
+
+	*rejected = 0;
+
+	n = args->n_entries - i; /* Number of new entries */
+
+	/* If this is an empty AppendEntries, there's nothing to write.
+	 * However, we still want to check if we can commit some entries.
+	 * Don't commit anything while a snapshot install is busy, though:
+	 * r->last_stored will be 0 in that case.
+	 *
+	 * From Figure 3.1:
+	 *
+	 *   AppendEntries RPC: Receiver implementation: If leaderCommit >
+	 *   commitIndex, set commitIndex = min(leaderCommit, index of last
+	 *   new entry).
+	 */
+	if (n == 0) {
+		if ((args->leader_commit > r->commit_index) &&
+		    r->last_stored >= r->commit_index &&
+		    !replicationInstallSnapshotBusy(r)) {
+			r->commit_index =
+			    min(args->leader_commit, r->last_stored);
+			rv = replicationApply(r);
+			if (rv != 0) {
+				return rv;
+			}
+		}
+
+		return 0;
+	}
+
+	*async = true;
+
+	request = raft_malloc(sizeof *request);
+	if (request == NULL) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+
+	request->raft = r;
+	request->args = *args;
+	/* Index of first new entry */
+	request->index = args->prev_log_index + 1 + i;
+
+	/* Update our in-memory log to reflect that we received these entries.
+	 * We'll notify the leader of a successful append once the write
+	 * entries request that we issue below actually completes. */
+	for (j = 0; j < n; j++) {
+		struct raft_entry *entry = &args->entries[i + j];
+
+		/* We are trying to append an entry at index X with term T to
+		 * our in-memory log. If we've gotten this far, we know that
+		 * the log *logically* has no entry at this index. However,
+		 * it's possible that we're still hanging on to such an entry,
+		 * because we previously tried to append and replicate it, and
+		 * the associated disk write failed, but some send requests
+		 * are still pending that refer to it. Since the log is not
+		 * capable of tracking multiple independent entries that share
+		 * an index and term, we just piggyback on the already-stored
+		 * entry in this case.
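+		 *
+		 * Concretely (hypothetical sequence): we appended entry
+		 * (index 5, term 2), the disk write failed and the entry was
+		 * logically removed, but a pending send request still
+		 * references it. If the leader now retransmits (index 5,
+		 * term 2), logReinstate() below revives the retained entry
+		 * instead of storing a second, independent copy.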
+		 */
+		rv = logReinstate(r->log, entry->term, entry->type,
+				  &reinstated);
+		if (rv != 0) {
+			goto err_after_request_alloc;
+		} else if (reinstated) {
+			continue;
+		}
+
+		/* TODO This copy should not strictly be necessary, as the
+		 * batch logic will take care of freeing the batch buffer in
+		 * which the entries are received. However, this would lead to
+		 * memory spikes in certain edge cases.
+		 * https://github.com/canonical/dqlite/issues/276
+		 */
+		struct raft_entry copy = {0};
+		rv = entryCopy(entry, &copy);
+		if (rv != 0) {
+			goto err_after_request_alloc;
+		}
+
+		rv = logAppend(r->log, copy.term, copy.type, &copy.buf, NULL);
+		if (rv != 0) {
+			goto err_after_request_alloc;
+		}
+	}
+
+	/* Acquire the relevant entries from the log. */
+	rv = logAcquire(r->log, request->index, &request->args.entries,
+			&request->args.n_entries);
+	if (rv != 0) {
+		goto err_after_request_alloc;
+	}
+
+	assert(request->args.n_entries == n);
+	if (request->args.n_entries == 0) {
+		tracef("No log entries found at index %llu", request->index);
+		ErrMsgPrintf(r->errmsg, "No log entries found at index %llu",
+			     request->index);
+		rv = RAFT_SHUTDOWN;
+		goto err_after_acquire_entries;
+	}
+
+	request->req.data = request;
+	rv = r->io->append(r->io, &request->req, request->args.entries,
+			   request->args.n_entries, appendFollowerCb);
+	if (rv != 0) {
+		ErrMsgTransfer(r->io->errmsg, r->errmsg, "io");
+		goto err_after_acquire_entries;
+	}
+	r->follower_state.append_in_flight_count += 1;
+
+	entryBatchesDestroy(args->entries, args->n_entries);
+	return 0;
+
+err_after_acquire_entries:
+	/* Release the entries related to the IO request */
+	logRelease(r->log, request->index, request->args.entries,
+		   request->args.n_entries);
+
+err_after_request_alloc:
+	/* Release all entries added to the in-memory log, making sure the
+	 * in-memory log and disk don't diverge, which would otherwise lead to
+	 * future log entries not being persisted to disk. */
+	if (j != 0) {
+		logTruncate(r->log, request->index);
+	}
+	raft_free(request);
+
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+struct recvInstallSnapshot
+{
+	struct raft *raft;
+	struct raft_snapshot snapshot;
+	raft_term term; /* Used to check for state transitions. */
+};
+
+static void installSnapshotCb(struct raft_io_snapshot_put *req, int status)
+{
+	struct recvInstallSnapshot *request = req->data;
+	struct raft *r = request->raft;
+	struct raft_snapshot *snapshot = &request->snapshot;
+	struct raft_append_entries_result result;
+	bool should_respond = true;
+	int rv;
+
+	/* We avoid converting to candidate state while installing a
+	 * snapshot. */
+	assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE);
+
+	r->snapshot.put.data = NULL;
+
+	result.term = r->current_term;
+	result.version = RAFT_APPEND_ENTRIES_RESULT_VERSION;
+	result.features = RAFT_DEFAULT_FEATURE_FLAGS;
+	result.rejected = 0;
+
+	/* If we are shutting down, let's discard the result. */
+	if (r->state == RAFT_UNAVAILABLE) {
+		tracef(
+		    "shutting down -> discard result of snapshot "
+		    "installation");
+		should_respond = false;
+		goto discard;
+	}
+	/* If the request is from a previous term, it means that someone else
+	 * became a candidate while we were installing the snapshot. In that
+	 * case, we want to install the snapshot anyway, but our "current
+	 * leader" may no longer be the same as the server that sent the
+	 * install request, so we shouldn't send a response to that server.
*/ + if (request->term != r->current_term) { + tracef( + "new term since receiving snapshot -> install but don't " + "respond"); + should_respond = false; + } + + if (status != 0) { + tracef("save snapshot %llu: %s", snapshot->index, + raft_strerror(status)); + goto discard; + } + + /* From Figure 5.3: + * + * 7. Discard the entire log + * 8. Reset state machine using snapshot contents (and load lastConfig + * as cluster configuration). + */ + rv = snapshotRestore(r, snapshot); + if (rv != 0) { + tracef("restore snapshot %llu: %s", snapshot->index, + raft_strerror(status)); + goto discard; + } + + tracef("restored snapshot with last index %llu", snapshot->index); + + goto respond; + +discard: + /* In case of error we must also free the snapshot data buffer and free + * the configuration. */ + result.rejected = snapshot->index; + raft_free(snapshot->bufs[0].base); + raft_free(snapshot->bufs); + raft_configuration_close(&snapshot->configuration); + +respond: + if (should_respond) { + result.last_log_index = r->last_stored; + sendAppendEntriesResult(r, &result); + } + + raft_free(request); +} + +int replicationInstallSnapshot(struct raft *r, + const struct raft_install_snapshot *args, + raft_index *rejected, + bool *async) +{ + struct recvInstallSnapshot *request; + struct raft_snapshot *snapshot; + raft_term local_term; + int rv; + + assert(r->state == RAFT_FOLLOWER); + + *rejected = args->last_index; + *async = false; + + /* If we are taking a snapshot ourselves or installing a snapshot, + * ignore the request, the leader will eventually retry. TODO: we should + * do something smarter. */ + if (r->snapshot.pending.term != 0 || r->snapshot.put.data != NULL) { + *async = true; + tracef("already taking or installing snapshot"); + return RAFT_BUSY; + } + + /* If our last snapshot is more up-to-date, this is a no-op */ + if (r->log->snapshot.last_index >= args->last_index) { + tracef("have more recent snapshot"); + *rejected = 0; + return 0; + } + + /* If we already have all entries in the snapshot, this is a no-op */ + local_term = logTermOf(r->log, args->last_index); + if (local_term != 0 && local_term >= args->last_term) { + tracef("have all entries"); + *rejected = 0; + return 0; + } + + *async = true; + + /* Preemptively update our in-memory state. */ + logRestore(r->log, args->last_index, args->last_term); + + r->last_stored = 0; + + request = raft_malloc(sizeof *request); + if (request == NULL) { + rv = RAFT_NOMEM; + goto err; + } + request->raft = r; + request->term = r->current_term; + + snapshot = &request->snapshot; + snapshot->term = args->last_term; + snapshot->index = args->last_index; + snapshot->configuration_index = args->conf_index; + snapshot->configuration = args->conf; + + snapshot->bufs = raft_malloc(sizeof *snapshot->bufs); + if (snapshot->bufs == NULL) { + rv = RAFT_NOMEM; + goto err_after_request_alloc; + } + snapshot->bufs[0] = args->data; + snapshot->n_bufs = 1; + + assert(r->snapshot.put.data == NULL); + r->snapshot.put.data = request; + rv = r->io->snapshot_put(r->io, + 0 /* zero trailing means replace everything */, + &r->snapshot.put, snapshot, installSnapshotCb); + if (rv != 0) { + tracef("snapshot_put failed %d", rv); + goto err_after_bufs_alloc; + } + + return 0; + +err_after_bufs_alloc: + raft_free(snapshot->bufs); + r->snapshot.put.data = NULL; +err_after_request_alloc: + raft_free(request); +err: + assert(rv != 0); + return rv; +} + +/* Apply a RAFT_COMMAND entry that has been committed. 
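+ * The entry's payload is handed to the user-supplied FSM's apply callback;
+ * if this server submitted the command via raft_apply(), the matching
+ * request's callback is then fired with the FSM's result.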
*/ +static int applyCommand(struct raft *r, + const raft_index index, + const struct raft_buffer *buf) +{ + struct raft_apply *req; + void *result; + int rv; + rv = r->fsm->apply(r->fsm, buf, &result); + if (rv != 0) { + return rv; + } + + r->last_applied = index; + + req = (struct raft_apply *)getRequest(r, index, RAFT_COMMAND); + if (req != NULL && req->cb != NULL) { + req->cb(req, 0, result); + } + return 0; +} + +/* Fire the callback of a barrier request whose entry has been committed. */ +static void applyBarrier(struct raft *r, const raft_index index) +{ + r->last_applied = index; + + struct raft_barrier *req; + req = (struct raft_barrier *)getRequest(r, index, RAFT_BARRIER); + if (req != NULL && req->cb != NULL) { + req->cb(req, 0); + } +} + +/* Apply a RAFT_CHANGE entry that has been committed. */ +static void applyChange(struct raft *r, const raft_index index) +{ + struct raft_change *req; + + assert(index > 0); + + /* If this is an uncommitted configuration that we had already applied + * when submitting the configuration change (for leaders) or upon + * receiving it via an AppendEntries RPC (for followers), then reset the + * uncommitted index, since that uncommitted configuration is now + * committed. */ + if (r->configuration_uncommitted_index == index) { + tracef("configuration at index:%llu is committed.", index); + r->configuration_uncommitted_index = 0; + } + + r->configuration_committed_index = index; + r->last_applied = index; + + if (r->state == RAFT_LEADER) { + const struct raft_server *server; + req = r->leader_state.change; + r->leader_state.change = NULL; + + /* If we are leader but not part of this new configuration, step + * down. + * + * From Section 4.2.2: + * + * In this approach, a leader that is removed from the + * configuration steps down once the Cnew entry is committed. + */ + server = configurationGet(&r->configuration, r->id); + if (server == NULL || server->role != RAFT_VOTER) { + tracef( + "leader removed from config or no longer voter " + "server: %p", + (void *)server); + convertToFollower(r); + } + + if (req != NULL && req->cb != NULL) { + req->cb(req, 0); + } + } +} + +static bool shouldTakeSnapshot(struct raft *r) +{ + /* If we are shutting down, let's not do anything. */ + if (r->state == RAFT_UNAVAILABLE) { + return false; + } + + /* If a snapshot is already in progress or we're installing a snapshot, + * we don't want to start another one. */ + if (r->snapshot.pending.term != 0 || r->snapshot.put.data != NULL) { + return false; + }; + + /* If we didn't reach the threshold yet, do nothing. */ + if (r->last_applied - r->log->snapshot.last_index < + r->snapshot.threshold) { + return false; + } + + return true; +} + +/* + * When taking a snapshot, ownership of the snapshot data is with raft if + * `snapshot_finalize` is NULL. + */ +static void takeSnapshotClose(struct raft *r, struct raft_snapshot *s) +{ + if (r->fsm->version == 1 || + (r->fsm->version > 1 && r->fsm->snapshot_finalize == NULL)) { + snapshotClose(s); + return; + } + + configurationClose(&s->configuration); + r->fsm->snapshot_finalize(r->fsm, &s->bufs, &s->n_bufs); +} + +static void takeSnapshotCb(struct raft_io_snapshot_put *req, int status) +{ + struct raft *r = req->data; + struct raft_snapshot *snapshot; + int rv; + + r->snapshot.put.data = NULL; + snapshot = &r->snapshot.pending; + + if (status != 0) { + tracef("snapshot %lld at term %lld: %s", snapshot->index, + snapshot->term, raft_strerror(status)); + goto out; + } + + /* Cache the configuration contained in the snapshot. 
+	 * While the snapshot was being written, new configuration changes
+	 * could have been committed; those changes will not be purged from
+	 * the log by this snapshot. However, we still cache the configuration
+	 * for consistency. */
+	configurationClose(&r->configuration_last_snapshot);
+	rv = configurationCopy(&snapshot->configuration,
+			       &r->configuration_last_snapshot);
+	if (rv != 0) {
+		/* TODO: make this a hard fault, because if we have no backup
+		 * and the log was truncated it will be impossible to rollback
+		 * an aborted configuration change. */
+		tracef("failed to backup last committed configuration.");
+	}
+	logSnapshot(r->log, snapshot->index, r->snapshot.trailing);
+out:
+	takeSnapshotClose(r, snapshot);
+	r->snapshot.pending.term = 0;
+}
+
+static int putSnapshot(struct raft *r,
+		       struct raft_snapshot *snapshot,
+		       raft_io_snapshot_put_cb cb)
+{
+	int rv;
+	assert(r->snapshot.put.data == NULL);
+	r->snapshot.put.data = r;
+	rv = r->io->snapshot_put(r->io, r->snapshot.trailing, &r->snapshot.put,
+				 snapshot, cb);
+	if (rv != 0) {
+		takeSnapshotClose(r, snapshot);
+		r->snapshot.pending.term = 0;
+		r->snapshot.put.data = NULL;
+	}
+
+	return rv;
+}
+
+static void takeSnapshotDoneCb(struct raft_io_async_work *take, int status)
+{
+	struct raft *r = take->data;
+	struct raft_snapshot *snapshot = &r->snapshot.pending;
+	int rv;
+
+	raft_free(take);
+
+	if (status != 0) {
+		tracef("take snapshot failed %s", raft_strerror(status));
+		takeSnapshotClose(r, snapshot);
+		r->snapshot.pending.term = 0;
+		r->snapshot.put.data = NULL;
+		return;
+	}
+
+	rv = putSnapshot(r, snapshot, takeSnapshotCb);
+	if (rv != 0) {
+		tracef("put snapshot failed %d", rv);
+	}
+}
+
+static int takeSnapshotAsync(struct raft_io_async_work *take)
+{
+	struct raft *r = take->data;
+	tracef("take snapshot async at %lld", r->snapshot.pending.index);
+	struct raft_snapshot *snapshot = &r->snapshot.pending;
+	return r->fsm->snapshot_async(r->fsm, &snapshot->bufs,
+				      &snapshot->n_bufs);
+}
+
+static int takeSnapshot(struct raft *r)
+{
+	struct raft_snapshot *snapshot;
+	int rv;
+
+	tracef("take snapshot at %lld", r->last_applied);
+
+	snapshot = &r->snapshot.pending;
+	snapshot->index = r->last_applied;
+	snapshot->term = logTermOf(r->log, r->last_applied);
+	snapshot->bufs = NULL;
+	snapshot->n_bufs = 0;
+
+	rv = membershipFetchLastCommittedConfiguration(
+	    r, &snapshot->configuration);
+	if (rv != 0) {
+		goto abort;
+	}
+	snapshot->configuration_index = r->configuration_committed_index;
+
+	rv = r->fsm->snapshot(r->fsm, &snapshot->bufs, &snapshot->n_bufs);
+	if (rv != 0) {
+		/* Ignore transient errors. We'll retry next time.
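+		 * Here RAFT_BUSY from fsm->snapshot() is treated as such a
+		 * transient error: it is cleared below, and
+		 * shouldTakeSnapshot() will trigger another attempt on a
+		 * later apply.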
*/ + if (rv == RAFT_BUSY) { + rv = 0; + } + raft_configuration_close(&snapshot->configuration); + goto abort; + } + + bool sync_snapshot = + r->fsm->version < 3 || r->fsm->snapshot_async == NULL; + if (sync_snapshot) { + /* putSnapshot will clean up config and buffers in case of error + */ + return putSnapshot(r, snapshot, takeSnapshotCb); + } else { + struct raft_io_async_work *take = raft_malloc(sizeof(*take)); + if (take == NULL) { + rv = RAFT_NOMEM; + goto abort_after_snapshot; + } + take->data = r; + take->work = takeSnapshotAsync; + rv = r->io->async_work(r->io, take, takeSnapshotDoneCb); + if (rv != 0) { + raft_free(take); + goto abort_after_snapshot; + } + } + + return 0; + +abort_after_snapshot: + /* Closes config and finalizes snapshot */ + takeSnapshotClose(r, snapshot); +abort: + r->snapshot.pending.term = 0; + return rv; +} + +int replicationApply(struct raft *r) +{ + raft_index index; + int rv = 0; + + assert(r->state == RAFT_LEADER || r->state == RAFT_FOLLOWER); + assert(r->last_applied <= r->commit_index); + + if (r->last_applied == r->commit_index) { + /* Nothing to do. */ + return 0; + } + + for (index = r->last_applied + 1; index <= r->commit_index; index++) { + const struct raft_entry *entry = logGet(r->log, index); + if (entry == NULL) { + /* This can happen while installing a snapshot */ + tracef("replicationApply - ENTRY NULL"); + return 0; + } + + assert(entry->type == RAFT_COMMAND || + entry->type == RAFT_BARRIER || + entry->type == RAFT_CHANGE); + + switch (entry->type) { + case RAFT_COMMAND: + rv = applyCommand(r, index, &entry->buf); + break; + case RAFT_BARRIER: + applyBarrier(r, index); + rv = 0; + break; + case RAFT_CHANGE: + applyChange(r, index); + rv = 0; + break; + default: + rv = 0; /* For coverity. This case can't be + taken. */ + break; + } + + if (rv != 0) { + break; + } + } + + if (shouldTakeSnapshot(r)) { + rv = takeSnapshot(r); + } + + return rv; +} + +void replicationQuorum(struct raft *r, const raft_index index) +{ + size_t votes = 0; + size_t i; + raft_term term; + + assert(r->state == RAFT_LEADER); + + if (index <= r->commit_index) { + return; + } + + term = logTermOf(r->log, index); + + /* TODO: fuzzy-test --seed 0x8db5fccc replication/entries/partitioned + * fails the assertion below. */ + if (term == 0) { + return; + } + // assert(logTermOf(r->log, index) > 0); + assert(!(term > r->current_term)); + + /* Don't commit entries from previous terms by counting replicas. */ + if (term < r->current_term) { + return; + } + + for (i = 0; i < r->configuration.n; i++) { + struct raft_server *server = &r->configuration.servers[i]; + if (server->role != RAFT_VOTER) { + continue; + } + if (r->leader_state.progress[i].match_index >= index) { + votes++; + } + } + + if (votes > configurationVoterCount(&r->configuration) / 2) { + r->commit_index = index; + tracef("new commit index %llu", r->commit_index); + } + + return; +} + +inline bool replicationInstallSnapshotBusy(struct raft *r) +{ + return r->last_stored == 0 && r->snapshot.put.data != NULL; +} + +#undef tracef diff --git a/src/raft/replication.h b/src/raft/replication.h new file mode 100644 index 000000000..5bfe07dbe --- /dev/null +++ b/src/raft/replication.h @@ -0,0 +1,98 @@ +/* Log replication logic and helpers. */ + +#ifndef REPLICATION_H_ +#define REPLICATION_H_ + +#include "../raft.h" + +/* Send AppendEntries RPC messages to all followers to which no AppendEntries + * was sent in the last heartbeat interval. 
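+ *
+ * In this patch it is driven by tickLeader() in src/raft/tick.c. A minimal
+ * driving loop (hypothetical sketch, not part of this change) would be:
+ *
+ *     while (running) {
+ *             sleep_ms(heartbeat_timeout); /* hypothetical helper */
+ *             if (raft_state(r) == RAFT_LEADER) {
+ *                     replicationHeartbeat(r);
+ *             }
+ *     }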
+ */
+int replicationHeartbeat(struct raft *r);
+
+/* Start a local disk write for entries from the given index onwards, and
+ * trigger replication against all followers, typically sending AppendEntries
+ * RPC messages with outstanding log entries. */
+int replicationTrigger(struct raft *r, raft_index index);
+
+/* Possibly send an AppendEntries or an InstallSnapshot RPC message to the
+ * server with the given index.
+ *
+ * The rules to decide whether or not to send a message are:
+ *
+ * - If we have sent an InstallSnapshot RPC recently and we haven't yet
+ *   received a response, then don't send any new message.
+ *
+ * - If we are probing the follower (i.e. we haven't received a successful
+ *   response during the last heartbeat interval), then send a message only
+ *   if we haven't sent any during the last heartbeat interval.
+ *
+ * - If we are pipelining entries to the follower, then send any new entries
+ *   that we haven't yet sent.
+ *
+ * If a message should be sent, the rules to decide what type of message to
+ * send and what it should contain are:
+ *
+ * - If we no longer have the first entry that should be sent to the
+ *   follower, then send an InstallSnapshot RPC with the last snapshot.
+ *
+ * - If we still have the first entry to send, then send all entries from
+ *   that index onward (possibly zero).
+ *
+ * This function must be called only by leaders. */
+int replicationProgress(struct raft *r, unsigned i);
+
+/* Update the replication state (match and next indexes) for the given server
+ * using the given AppendEntries RPC result.
+ *
+ * Possibly send to the server a new set of entries or a snapshot if the
+ * result was unsuccessful because of missing entries or if new entries were
+ * added to our log in the meantime.
+ *
+ * It must be called only by leaders. */
+int replicationUpdate(struct raft *r,
+		      const struct raft_server *server,
+		      const struct raft_append_entries_result *result);
+
+/* Append the log entries in the given request if the Log Matching Property is
+ * satisfied.
+ *
+ * The rejected output parameter will be set to 0 if the Log Matching Property
+ * was satisfied, or to args->prev_log_index if not.
+ *
+ * The async output parameter will be set to true if some of the entries in
+ * the request were not present in our log, and a disk write was started to
+ * persist them to disk. The entries will still be appended immediately to our
+ * in-memory copy of the log, but an AppendEntries result message will be sent
+ * only once the disk write completes and the I/O callback is invoked.
+ *
+ * It must be called only by followers. */
+int replicationAppend(struct raft *r,
+		      const struct raft_append_entries *args,
+		      raft_index *rejected,
+		      bool *async);
+
+int replicationInstallSnapshot(struct raft *r,
+			       const struct raft_install_snapshot *args,
+			       raft_index *rejected,
+			       bool *async);
+
+/* Returns `true` if the raft instance is currently installing a snapshot */
+bool replicationInstallSnapshotBusy(struct raft *r);
+
+/* Apply any committed entry that was not applied yet.
+ *
+ * It must be called by leaders or followers. */
+int replicationApply(struct raft *r);
+
+/* Check if a quorum has been reached for the given log index, and update the
+ * commit index accordingly if so.
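+ *
+ * For example (illustrative numbers): with five voters whose match indexes
+ * are 9, 9, 8, 7 and 7, index 8 is stored on three of the five, so the
+ * commit index can advance to 8, provided the entry at index 8 carries the
+ * current term.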
+ *
+ * From Figure 3.1:
+ *
+ *   [Rules for servers] Leaders:
+ *
+ *   If there exists an N such that N > commitIndex, a majority of
+ *   matchIndex[i] >= N, and log[N].term == currentTerm: set commitIndex = N */
+void replicationQuorum(struct raft *r, const raft_index index);
+
+#endif /* REPLICATION_H_ */
diff --git a/src/raft/request.h b/src/raft/request.h
new file mode 100644
index 000000000..08ad4a36b
--- /dev/null
+++ b/src/raft/request.h
@@ -0,0 +1,20 @@
+#ifndef REQUEST_H_
+#define REQUEST_H_
+
+#include "../raft.h"
+
+/* Abstract request type */
+struct request
+{
+	/* Must be kept in sync with RAFT__REQUEST in raft.h */
+	void *data;
+	int type;
+	raft_index index;
+	void *queue[2];
+	uint8_t req_id[16];
+	uint8_t client_id[16];
+	uint8_t unique_id[16];
+	uint64_t reserved[4];
+};
+
+#endif /* REQUEST_H_ */
diff --git a/src/raft/snapshot.c b/src/raft/snapshot.c
new file mode 100644
index 000000000..d05994fcb
--- /dev/null
+++ b/src/raft/snapshot.c
@@ -0,0 +1,114 @@
+#include "snapshot.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "../tracing.h"
+#include "assert.h"
+#include "configuration.h"
+#include "err.h"
+#include "log.h"
+
+void snapshotClose(struct raft_snapshot *s)
+{
+	unsigned i;
+	configurationClose(&s->configuration);
+	for (i = 0; i < s->n_bufs; i++) {
+		raft_free(s->bufs[i].base);
+	}
+	raft_free(s->bufs);
+}
+
+void snapshotDestroy(struct raft_snapshot *s)
+{
+	snapshotClose(s);
+	raft_free(s);
+}
+
+int snapshotRestore(struct raft *r, struct raft_snapshot *snapshot)
+{
+	int rv;
+
+	assert(snapshot->n_bufs == 1);
+
+	rv = r->fsm->restore(r->fsm, &snapshot->bufs[0]);
+	if (rv != 0) {
+		tracef("restore snapshot %llu: %s", snapshot->index,
+		       errCodeToString(rv));
+		return rv;
+	}
+
+	configurationClose(&r->configuration);
+	r->configuration = snapshot->configuration;
+	r->configuration_committed_index = snapshot->configuration_index;
+	r->configuration_uncommitted_index = 0;
+
+	/* Make a copy of the configuration contained in the snapshot, in case
+	 * r->configuration gets overridden with an uncommitted configuration
+	 * and we then need to roll back, but the log no longer contains the
+	 * entry at r->configuration_committed_index because it was truncated.
+	 */
+	configurationClose(&r->configuration_last_snapshot);
+	rv = configurationCopy(&r->configuration,
+			       &r->configuration_last_snapshot);
+	if (rv != 0) {
+		return rv;
+	}
+
+	configurationTrace(r, &r->configuration,
+			   "configuration restore from snapshot");
+
+	r->commit_index = snapshot->index;
+	r->last_applied = snapshot->index;
+	r->last_stored = snapshot->index;
+
+	/* Don't free the snapshot data buffer, as ownership has been
+	 * transferred to the fsm.
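+	 * (fsm->restore() above took bufs[0].base; only the array of buffer
+	 * descriptors is released below.)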
+	 */
+	raft_free(snapshot->bufs);
+
+	return 0;
+}
+
+int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst)
+{
+	int rv;
+	unsigned i;
+	size_t size;
+	uint8_t *cursor;
+
+	dst->term = src->term;
+	dst->index = src->index;
+	dst->configuration_index = src->configuration_index;
+
+	rv = configurationCopy(&src->configuration, &dst->configuration);
+	if (rv != 0) {
+		return rv;
+	}
+
+	size = 0;
+	for (i = 0; i < src->n_bufs; i++) {
+		size += src->bufs[i].len;
+	}
+
+	dst->bufs = raft_malloc(sizeof *dst->bufs);
+	if (dst->bufs == NULL) {
+		return RAFT_NOMEM;
+	}
+
+	dst->bufs[0].base = raft_malloc(size);
+	if (dst->bufs[0].base == NULL) {
+		return RAFT_NOMEM;
+	}
+	dst->bufs[0].len = size;
+
+	cursor = dst->bufs[0].base;
+
+	for (i = 0; i < src->n_bufs; i++) {
+		memcpy(cursor, src->bufs[i].base, src->bufs[i].len);
+		cursor += src->bufs[i].len;
+	}
+
+	dst->n_bufs = 1;
+
+	return 0;
+}
+
+#undef tracef
diff --git a/src/raft/snapshot.h b/src/raft/snapshot.h
new file mode 100644
index 000000000..90ab1b337
--- /dev/null
+++ b/src/raft/snapshot.h
@@ -0,0 +1,28 @@
+#ifndef RAFT_SNAPSHOT_H_
+#define RAFT_SNAPSHOT_H_
+
+#include "../raft.h"
+
+/* Release all memory associated with the given snapshot. */
+void snapshotClose(struct raft_snapshot *s);
+
+/* Like snapshotClose(), but also release the snapshot object itself. */
+void snapshotDestroy(struct raft_snapshot *s);
+
+/* Restore a snapshot.
+ *
+ * This will reset the current state of the server as if the last entry
+ * contained in the snapshot had just been persisted, committed and applied.
+ *
+ * The in-memory log must be empty when calling this function.
+ *
+ * If no error occurs, the memory of the snapshot object gets released. */
+int snapshotRestore(struct raft *r, struct raft_snapshot *snapshot);
+
+/* Make a full deep copy of a snapshot object.
+ *
+ * All data buffers in the source snapshot will be compacted in a single
+ * buffer in the destination snapshot. */
+int snapshotCopy(const struct raft_snapshot *src, struct raft_snapshot *dst);
+
+#endif /* RAFT_SNAPSHOT_H_ */
diff --git a/src/raft/start.c b/src/raft/start.c
new file mode 100644
index 000000000..023d51e74
--- /dev/null
+++ b/src/raft/start.c
@@ -0,0 +1,232 @@
+#include "../raft.h"
+#include "../tracing.h"
+#include "assert.h"
+#include "configuration.h"
+#include "convert.h"
+#include "entry.h"
+#include "err.h"
+#include "log.h"
+#include "recv.h"
+#include "snapshot.h"
+#include "tick.h"
+
+/* Restore the most recent configuration entry found in the log. */
+static int restoreMostRecentConfigurationEntry(struct raft *r,
+					       struct raft_entry *entry,
+					       raft_index index)
+{
+	struct raft_configuration configuration;
+	int rv;
+
+	rv = configurationDecode(&entry->buf, &configuration);
+	if (rv != 0) {
+		configurationClose(&configuration);
+		return rv;
+	}
+
+	configurationClose(&r->configuration);
+	r->configuration = configuration;
+
+	/* If the configuration comes from the entry at index 1 in the log, we
+	 * know it's the bootstrap configuration and it's committed by
+	 * default. Otherwise we can't know if it's committed or not and treat
+	 * it as uncommitted. */
+	if (index == 1) {
+		assert(r->configuration_uncommitted_index == 0);
+		r->configuration_committed_index = 1;
+	} else {
+		assert(r->configuration_committed_index < index);
+		r->configuration_uncommitted_index = index;
+	}
+
+	configurationTrace(r, &r->configuration,
+			   "restore most recent configuration");
+	return 0;
+}
+
+/* Restore the entries that were loaded from persistent storage.
+ * The most recent configuration entry will be restored as well, if any.
+ *
+ * Note that if the last configuration entry in the log has index greater
+ * than one we cannot know if it is committed or not. Therefore we also need
+ * to track the second-to-last configuration entry. This second-to-last entry
+ * is committed by default, as raft doesn't allow multiple uncommitted
+ * configuration entries. That entry is used in case of configuration
+ * rollback scenarios. If we don't find the second-to-last configuration
+ * entry in the log, it means that the log was truncated after a snapshot and
+ * the second-to-last configuration is available in
+ * r->configuration_last_snapshot, which we populated earlier when the
+ * snapshot was restored. */
+static int restoreEntries(struct raft *r,
+			  raft_index snapshot_index,
+			  raft_term snapshot_term,
+			  raft_index start_index,
+			  struct raft_entry *entries,
+			  size_t n)
+{
+	struct raft_entry *conf = NULL;
+	raft_index conf_index = 0;
+	size_t i;
+	int rv;
+	logStart(r->log, snapshot_index, snapshot_term, start_index);
+	r->last_stored = start_index - 1;
+	for (i = 0; i < n; i++) {
+		struct raft_entry *entry = &entries[i];
+		rv = logAppend(r->log, entry->term, entry->type, &entry->buf,
+			       entry->batch);
+		if (rv != 0) {
+			goto err;
+		}
+		r->last_stored++;
+
+		/* Only take into account configurations that are newer than
+		 * the configuration restored from the snapshot. */
+		if (entry->type == RAFT_CHANGE &&
+		    r->last_stored > r->configuration_committed_index) {
+			/* If there is a previous configuration it must have
+			 * been committed as we don't allow multiple
+			 * uncommitted configurations. At the end of the loop
+			 * r->configuration_committed_index will point to the
+			 * second to last configuration entry, if any. */
+			if (conf_index != 0) {
+				r->configuration_committed_index = conf_index;
+			}
+			conf = entry;
+			conf_index = r->last_stored;
+		}
+	}
+
+	if (conf != NULL) {
+		rv = restoreMostRecentConfigurationEntry(r, conf, conf_index);
+		if (rv != 0) {
+			goto err;
+		}
+	}
+
+	raft_free(entries);
+	return 0;
+
+err:
+	if (logNumEntries(r->log) > 0) {
+		logDiscard(r->log, r->log->offset + 1);
+	}
+	return rv;
+}
+
+/* If we're the only voting server in the configuration, automatically
+ * self-elect ourselves and convert to leader without waiting for the election
+ * timeout. */
+static int maybeSelfElect(struct raft *r)
+{
+	const struct raft_server *server;
+	int rv;
+	server = configurationGet(&r->configuration, r->id);
+	if (server == NULL || server->role != RAFT_VOTER ||
+	    configurationVoterCount(&r->configuration) > 1) {
+		return 0;
+	}
+	/* Converting to candidate will notice that we're the only voter and
+	 * automatically convert to leader.
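+	 *
+	 * This is what makes a fresh single-node cluster usable right away.
+	 * A hypothetical bootstrap sequence such as:
+	 *
+	 *     raft_configuration_init(&conf);
+	 *     raft_configuration_add(&conf, 1, "127.0.0.1:9001", RAFT_VOTER);
+	 *     raft_bootstrap(&r, &conf);
+	 *     raft_start(&r);
+	 *
+	 * leaves the server in RAFT_LEADER state without waiting for an
+	 * election timeout.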
*/ + rv = convertToCandidate(r, false /* disrupt leader */); + if (rv != 0) { + return rv; + } + assert(r->state == RAFT_LEADER); + return 0; +} + +int raft_start(struct raft *r) +{ + struct raft_snapshot *snapshot; + raft_index snapshot_index = 0; + raft_term snapshot_term = 0; + raft_index start_index; + struct raft_entry *entries; + size_t n_entries; + int rv; + + assert(r != NULL); + assert(r->state == RAFT_UNAVAILABLE); + assert(r->heartbeat_timeout != 0); + assert(r->heartbeat_timeout < r->election_timeout); + assert(r->install_snapshot_timeout != 0); + assert(logNumEntries(r->log) == 0); + assert(logSnapshotIndex(r->log) == 0); + assert(r->last_stored == 0); + +#ifndef RAFT_REVISION +#define RAFT_REVISION "unknown" +#endif + tracef("starting version:%d revision:%s", raft_version_number(), + RAFT_REVISION); + rv = r->io->load(r->io, &r->current_term, &r->voted_for, &snapshot, + &start_index, &entries, &n_entries); + if (rv != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); + return rv; + } + assert(start_index >= 1); + tracef( + "current_term:%llu voted_for:%llu start_index:%llu n_entries:%zu", + r->current_term, r->voted_for, start_index, n_entries); + + /* If we have a snapshot, let's restore it. */ + if (snapshot != NULL) { + tracef( + "restore snapshot with last index %llu and last term %llu", + snapshot->index, snapshot->term); + rv = snapshotRestore(r, snapshot); + if (rv != 0) { + snapshotDestroy(snapshot); + entryBatchesDestroy(entries, n_entries); + return rv; + } + snapshot_index = snapshot->index; + snapshot_term = snapshot->term; + raft_free(snapshot); + } else if (n_entries > 0) { + /* If we don't have a snapshot and the on-disk log is not empty, + * then the first entry must be a configuration entry. */ + assert(start_index == 1); + assert(entries[0].type == RAFT_CHANGE); + + /* As a small optimization, bump the commit index to 1 since we + * require the first entry to be the same on all servers. */ + r->commit_index = 1; + r->last_applied = 1; + } + + /* Append the entries to the log, possibly restoring the last + * configuration. */ + tracef("restore %zu entries starting at %llu", n_entries, start_index); + rv = restoreEntries(r, snapshot_index, snapshot_term, start_index, + entries, n_entries); + if (rv != 0) { + entryBatchesDestroy(entries, n_entries); + return rv; + } + + /* Start the I/O backend. The tickCb function is expected to fire every + * r->heartbeat_timeout milliseconds and recvCb whenever an RPC is + * received. */ + rv = r->io->start(r->io, r->heartbeat_timeout, tickCb, recvCb); + if (rv != 0) { + tracef("io start failed %d", rv); + return rv; + } + + /* By default we start as followers. */ + convertToFollower(r); + + /* If there's only one voting server, and that is us, it's safe to + * convert to leader right away. If that is not us, we're either joining + * the cluster or we're simply configured as non-voter, and we'll stay + * follower. 
+	 */
+	rv = maybeSelfElect(r);
+	if (rv != 0) {
+		return rv;
+	}
+
+	return 0;
+}
+
+#undef tracef
diff --git a/src/raft/state.c b/src/raft/state.c
new file mode 100644
index 000000000..af46d76d9
--- /dev/null
+++ b/src/raft/state.c
@@ -0,0 +1,54 @@
+#include "assert.h"
+#include "configuration.h"
+#include "election.h"
+#include "log.h"
+#include "queue.h"
+
+int raft_state(struct raft *r)
+{
+	return r->state;
+}
+
+void raft_leader(struct raft *r, raft_id *id, const char **address)
+{
+	switch (r->state) {
+		case RAFT_UNAVAILABLE:
+		case RAFT_CANDIDATE:
+			*id = 0;
+			*address = NULL;
+			return;
+		case RAFT_FOLLOWER:
+			*id = r->follower_state.current_leader.id;
+			*address = r->follower_state.current_leader.address;
+			return;
+		case RAFT_LEADER:
+			if (r->transfer != NULL) {
+				*id = 0;
+				*address = NULL;
+				return;
+			}
+			*id = r->id;
+			*address = r->address;
+			return;
+	}
+}
+
+raft_index raft_last_index(struct raft *r)
+{
+	return logLastIndex(r->log);
+}
+
+raft_index raft_last_applied(struct raft *r)
+{
+	return r->last_applied;
+}
+
+int raft_role(struct raft *r)
+{
+	const struct raft_server *local =
+	    configurationGet(&r->configuration, r->id);
+	if (local == NULL) {
+		return -1;
+	}
+	return local->role;
+}
diff --git a/src/raft/syscall.c b/src/raft/syscall.c
new file mode 100644
index 000000000..12c4390a0
--- /dev/null
+++ b/src/raft/syscall.c
@@ -0,0 +1,58 @@
+#include "syscall.h"
+
+#if HAVE_LINUX_AIO_ABI_H || HAVE_LINUX_IO_URING_H
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+#if HAVE_LINUX_AIO_ABI_H
+int io_setup(unsigned nr_events, aio_context_t *ctx_idp)
+{
+	return (int)syscall(__NR_io_setup, nr_events, ctx_idp);
+}
+
+int io_destroy(aio_context_t ctx_id)
+{
+	return (int)syscall(__NR_io_destroy, ctx_id);
+}
+
+int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp)
+{
+	return (int)syscall(__NR_io_submit, ctx_id, nr, iocbpp);
+}
+
+int io_getevents(aio_context_t ctx_id,
+		 long min_nr,
+		 long nr,
+		 struct io_event *events,
+		 struct timespec *timeout)
+{
+	return (int)syscall(__NR_io_getevents, ctx_id, min_nr, nr, events,
+			    timeout);
+}
+#endif
+
+#if HAVE_LINUX_IO_URING_H
+int io_uring_register(int fd,
+		      unsigned int opcode,
+		      const void *arg,
+		      unsigned int nr_args)
+{
+	return (int)syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
+}
+
+int io_uring_setup(unsigned int entries, struct io_uring_params *p)
+{
+	return (int)syscall(__NR_io_uring_setup, entries, p);
+}
+
+int io_uring_enter(int fd,
+		   unsigned int to_submit,
+		   unsigned int min_complete,
+		   unsigned int flags,
+		   sigset_t *sig)
+{
+	return (int)syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
+			    flags, sig, _NSIG / 8);
+}
+#endif
diff --git a/src/raft/syscall.h b/src/raft/syscall.h
new file mode 100644
index 000000000..c8459fffc
--- /dev/null
+++ b/src/raft/syscall.h
@@ -0,0 +1,47 @@
+/* Wrappers for system calls not yet defined in libc.
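+ *
+ * At the time of writing, glibc does not ship wrappers for the AIO and
+ * io_uring system calls, so they are invoked directly through syscall(2)
+ * with their __NR_* numbers.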
+ */
+
+#ifndef SYSCALL_H_
+#define SYSCALL_H_
+
+#if HAVE_LINUX_AIO_ABI_H
+#include <linux/aio_abi.h>
+#include <signal.h>
+#include <time.h>
+#endif
+
+#if HAVE_LINUX_IO_URING_H
+#include <linux/io_uring.h>
+#endif
+
+#if HAVE_LINUX_AIO_ABI_H
+/* AIO */
+int io_setup(unsigned nr_events, aio_context_t *ctx_idp);
+
+int io_destroy(aio_context_t ctx_id);
+
+int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocbpp);
+
+int io_getevents(aio_context_t ctx_id,
+		 long min_nr,
+		 long nr,
+		 struct io_event *events,
+		 struct timespec *timeout);
+#endif
+
+#if HAVE_LINUX_IO_URING_H
+/* uring */
+int io_uring_register(int fd,
+		      unsigned int opcode,
+		      const void *arg,
+		      unsigned int nr_args);
+
+int io_uring_setup(unsigned int entries, struct io_uring_params *p);
+
+int io_uring_enter(int fd,
+		   unsigned int to_submit,
+		   unsigned int min_complete,
+		   unsigned int flags,
+		   sigset_t *sig);
+#endif
+
+#endif /* SYSCALL_H_ */
diff --git a/src/raft/tick.c b/src/raft/tick.c
new file mode 100644
index 000000000..f6dd407c7
--- /dev/null
+++ b/src/raft/tick.c
@@ -0,0 +1,259 @@
+#include "../raft.h"
+#include "../tracing.h"
+#include "assert.h"
+#include "configuration.h"
+#include "convert.h"
+#include "election.h"
+#include "membership.h"
+#include "progress.h"
+#include "replication.h"
+
+/* Apply time-dependent rules for followers (Figure 3.1). */
+static int tickFollower(struct raft *r)
+{
+	const struct raft_server *server;
+	int rv;
+
+	assert(r != NULL);
+	assert(r->state == RAFT_FOLLOWER);
+
+	server = configurationGet(&r->configuration, r->id);
+
+	/* If we have been removed from the configuration, or maybe we didn't
+	 * receive one yet, just stay follower. */
+	if (server == NULL) {
+		return 0;
+	}
+
+	/* Check if we need to start an election.
+	 *
+	 * From Section 3.3:
+	 *
+	 *   If a follower receives no communication over a period of time
+	 *   called the election timeout, then it assumes there is no viable
+	 *   leader and begins an election to choose a new leader.
+	 *
+	 * Figure 3.1:
+	 *
+	 *   If election timeout elapses without receiving AppendEntries RPC
+	 *   from current leader or granting vote to candidate, convert to
+	 *   candidate.
+	 */
+	if (electionTimerExpired(r) && server->role == RAFT_VOTER) {
+		if (replicationInstallSnapshotBusy(r)) {
+			tracef(
+			    "installing snapshot -> don't convert to "
+			    "candidate");
+			electionResetTimer(r);
+			return 0;
+		}
+		if (r->follower_state.append_in_flight_count > 0) {
+			tracef(
+			    "append in progress -> don't convert to candidate");
+			electionResetTimer(r);
+			return 0;
+		}
+		tracef("convert to candidate and start new election");
+		rv = convertToCandidate(r, false /* disrupt leader */);
+		if (rv != 0) {
+			tracef("convert to candidate: %s", raft_strerror(rv));
+			return rv;
+		}
+	}
+
+	return 0;
+}
+
+/* Apply time-dependent rules for candidates (Figure 3.1). */
+static int tickCandidate(struct raft *r)
+{
+	assert(r != NULL);
+	assert(r->state == RAFT_CANDIDATE);
+
+	/* Check if we need to start an election.
+	 *
+	 * From Section 3.4:
+	 *
+	 *   The third possible outcome is that a candidate neither wins nor
+	 *   loses the election: if many followers become candidates at the
+	 *   same time, votes could be split so that no candidate obtains a
+	 *   majority.
+	 *   When this happens, each candidate will time out and start a new
+	 *   election by incrementing its term and initiating another round of
+	 *   RequestVote RPCs.
+	 */
+	if (electionTimerExpired(r)) {
+		tracef("start new election");
+		return electionStart(r);
+	}
+
+	return 0;
+}
+
+/* Return true if we received an AppendEntries RPC result from a majority of
+ * voting servers since we became leader or since the last time this function
+ * was called.
+ *
+ * For each server the function checks the recent_recv flag of the associated
+ * progress object, and resets the flag after the check. It returns true if a
+ * majority of voting servers had the flag set to true. */
+static bool checkContactQuorum(struct raft *r)
+{
+	unsigned i;
+	unsigned contacts = 0;
+	assert(r->state == RAFT_LEADER);
+
+	for (i = 0; i < r->configuration.n; i++) {
+		struct raft_server *server = &r->configuration.servers[i];
+		bool recent_recv = progressResetRecentRecv(r, i);
+		if ((server->role == RAFT_VOTER && recent_recv) ||
+		    server->id == r->id) {
+			contacts++;
+		}
+	}
+	r->leader_state.voter_contacts = contacts;
+
+	return contacts > configurationVoterCount(&r->configuration) / 2;
+}
+
+/* Apply time-dependent rules for leaders (Figure 3.1). */
+static int tickLeader(struct raft *r)
+{
+	raft_time now = r->io->time(r->io);
+	assert(r->state == RAFT_LEADER);
+
+	/* Check if we still can reach a majority of servers.
+	 *
+	 * From Section 6.2:
+	 *
+	 *   A leader in Raft steps down if an election timeout elapses
+	 *   without a successful round of heartbeats to a majority of its
+	 *   cluster; this allows clients to retry their requests with another
+	 *   server.
+	 */
+	if (now - r->election_timer_start >= r->election_timeout) {
+		if (!checkContactQuorum(r)) {
+			tracef(
+			    "unable to contact majority of cluster -> step "
+			    "down");
+			convertToFollower(r);
+			return 0;
+		}
+		r->election_timer_start = r->io->time(r->io);
+	}
+
+	/* Possibly send heartbeats.
+	 *
+	 * From Figure 3.1:
+	 *
+	 *   Send empty AppendEntries RPC during idle periods to prevent
+	 *   election timeouts.
+	 */
+	replicationHeartbeat(r);
+
+	/* If a server is being promoted, increment the timer of the current
+	 * round or abort the promotion.
+	 *
+	 * From Section 4.2.1:
+	 *
+	 *   The algorithm waits a fixed number of rounds (such as 10). If the
+	 *   last round lasts less than an election timeout, then the leader
+	 *   adds the new server to the cluster, under the assumption that
+	 *   there are not enough unreplicated entries to create a significant
+	 *   availability gap. Otherwise, the leader aborts the configuration
+	 *   change with an error.
+	 */
+	if (r->leader_state.promotee_id != 0) {
+		raft_id id = r->leader_state.promotee_id;
+		unsigned server_index;
+		raft_time round_duration = now - r->leader_state.round_start;
+		bool is_too_slow;
+		bool is_unresponsive;
+
+		/* If a promotion is in progress, we expect that our
+		 * configuration contains an entry for the server being
+		 * promoted, and that the server is not yet considered as
+		 * voting. */
+		server_index = configurationIndexOf(&r->configuration, id);
+		assert(server_index < r->configuration.n);
+		assert(r->configuration.servers[server_index].role !=
+		       RAFT_VOTER);
+
+		is_too_slow =
+		    (r->leader_state.round_number == r->max_catch_up_rounds &&
+		     round_duration > r->election_timeout);
+		is_unresponsive =
+		    round_duration > r->max_catch_up_round_duration;
+
+		/* Abort the promotion if we are at the 10th round and it's
+		 * still taking too long, or if the server is unresponsive.
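+		 *
+		 * For example (illustrative numbers): with
+		 * max_catch_up_rounds = 10 and a one-second election timeout,
+		 * the promotion is aborted if the tenth round alone takes
+		 * longer than one second, or if any round exceeds
+		 * max_catch_up_round_duration.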
*/
+		if (is_too_slow || is_unresponsive) {
+			tracef(
+			    "server_index:%d is_too_slow:%d is_unresponsive:%d",
+			    server_index, is_too_slow, is_unresponsive);
+			struct raft_change *change;
+
+			r->leader_state.promotee_id = 0;
+
+			r->leader_state.round_index = 0;
+			r->leader_state.round_number = 0;
+			r->leader_state.round_start = 0;
+
+			change = r->leader_state.change;
+			r->leader_state.change = NULL;
+			if (change != NULL && change->cb != NULL) {
+				change->cb(change, RAFT_NOCONNECTION);
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int tick(struct raft *r)
+{
+	int rv = -1;
+
+	assert(r->state == RAFT_UNAVAILABLE || r->state == RAFT_FOLLOWER ||
+	       r->state == RAFT_CANDIDATE || r->state == RAFT_LEADER);
+
+	/* If we are not available, let's do nothing. */
+	if (r->state == RAFT_UNAVAILABLE) {
+		return 0;
+	}
+
+	switch (r->state) {
+		case RAFT_FOLLOWER:
+			rv = tickFollower(r);
+			break;
+		case RAFT_CANDIDATE:
+			rv = tickCandidate(r);
+			break;
+		case RAFT_LEADER:
+			rv = tickLeader(r);
+			break;
+	}
+
+	return rv;
+}
+
+void tickCb(struct raft_io *io)
+{
+	struct raft *r;
+	int rv;
+	r = io->data;
+	rv = tick(r);
+	if (rv != 0) {
+		convertToUnavailable(r);
+		return;
+	}
+
+	/* For all states: if there is a leadership transfer request in
+	 * progress, check if it's expired. */
+	if (r->transfer != NULL) {
+		raft_time now = r->io->time(r->io);
+		if (now - r->transfer->start >= r->election_timeout) {
+			membershipLeadershipTransferClose(r);
+		}
+	}
+}
+
+#undef tracef
diff --git a/src/raft/tick.h b/src/raft/tick.h
new file mode 100644
index 000000000..ad8751aee
--- /dev/null
+++ b/src/raft/tick.h
@@ -0,0 +1,12 @@
+/* Logic to be invoked periodically. */
+
+#ifndef TICK_H_
+#define TICK_H_
+
+#include "../raft.h"
+
+/* Callback to be passed to the @raft_io implementation. It notifies us that a
+ * certain amount of time has elapsed and will be invoked periodically. */
+void tickCb(struct raft_io *io);
+
+#endif /* TICK_H_ */
diff --git a/src/raft/utils.h b/src/raft/utils.h
new file mode 100644
index 000000000..d01688c87
--- /dev/null
+++ b/src/raft/utils.h
@@ -0,0 +1,17 @@
+#ifndef RAFT_UTILS_H_
+#define RAFT_UTILS_H_
+
+#include <stdio.h>
+
+/* Various utility functions and macros */
+
+#define LIKELY(x) __builtin_expect(!!(x), 1)
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+
+#define DBG() fprintf(stderr, "%s:%d\n", __func__, __LINE__)
+
+#define UNUSED __attribute__((unused))
+
+#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof(a)[0]))
+
+#endif /* RAFT_UTILS_H_ */
diff --git a/src/raft/uv.c b/src/raft/uv.c
new file mode 100644
index 000000000..c0602c5d3
--- /dev/null
+++ b/src/raft/uv.c
@@ -0,0 +1,815 @@
+#include "../raft.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/random.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../raft.h"
+#include "../tracing.h"
+#include "assert.h"
+#include "byte.h"
+#include "configuration.h"
+#include "entry.h"
+#include "heap.h"
+#include "snapshot.h"
+#include "uv.h"
+#include "uv_encoding.h"
+#include "uv_os.h"
+
+/* Retry to connect to peer servers every second.
+ *
+ * TODO: implement an exponential backoff instead.
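+ *
+ * A minimal sketch of what such a backoff could look like (illustrative
+ * pseudo-code only; MAX_CONNECT_RETRY_DELAY would be a new upper-bound
+ * constant, not something defined here):
+ *
+ *   delay = CONNECT_RETRY_DELAY;
+ *   on each failed connect attempt:
+ *       delay = MIN(delay * 2, MAX_CONNECT_RETRY_DELAY);
+ *   on a successful connect:
+ *       delay = CONNECT_RETRY_DELAY;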
*/ +#define CONNECT_RETRY_DELAY 1000 + +/* Cleans up files that are no longer used by the system */ +static int uvMaintenance(const char *dir, char *errmsg) +{ + struct uv_fs_s req; + struct uv_dirent_s entry; + int n; + int i; + int rv; + int rv2; + + n = uv_fs_scandir(NULL, &req, dir, 0, NULL); + if (n < 0) { + ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n)); + return RAFT_IOERR; + } + + rv = 0; + for (i = 0; i < n; i++) { + const char *filename; + rv = uv_fs_scandir_next(&req, &entry); + assert(rv == 0); /* Can't fail in libuv */ + + filename = entry.name; + /* Remove leftover tmp-files */ + if (strncmp(filename, TMP_FILE_PREFIX, + strlen(TMP_FILE_PREFIX)) == 0) { + UvFsRemoveFile(dir, filename, + errmsg); /* Ignore errors */ + continue; + } + + /* Remove orphaned snapshot files */ + bool orphan = false; + if ((UvSnapshotIsOrphan(dir, filename, &orphan) == 0) && + orphan) { + UvFsRemoveFile(dir, filename, + errmsg); /* Ignore errors */ + continue; + } + + /* Remove orphaned snapshot metadata files */ + if ((UvSnapshotMetaIsOrphan(dir, filename, &orphan) == 0) && + orphan) { + UvFsRemoveFile(dir, filename, + errmsg); /* Ignore errors */ + } + } + + rv2 = uv_fs_scandir_next(&req, &entry); + assert(rv2 == UV_EOF); + return rv; +} + +/* Implementation of raft_io->config. */ +static int uvInit(struct raft_io *io, raft_id id, const char *address) +{ + struct uv *uv; + size_t direct_io; + struct uvMetadata metadata; + int rv; + uv = io->impl; + uv->id = id; + + rv = UvFsCheckDir(uv->dir, io->errmsg); + if (rv != 0) { + return rv; + } + + /* Probe file system capabilities */ + rv = UvFsProbeCapabilities(uv->dir, &direct_io, &uv->async_io, + &uv->fallocate, io->errmsg); + if (rv != 0) { + return rv; + } + uv->direct_io = direct_io != 0; + uv->block_size = direct_io != 0 ? direct_io : 4096; + + rv = uvMaintenance(uv->dir, io->errmsg); + if (rv != 0) { + return rv; + } + + rv = uvMetadataLoad(uv->dir, &metadata, io->errmsg); + if (rv != 0) { + return rv; + } + uv->metadata = metadata; + + rv = uv->transport->init(uv->transport, id, address); + if (rv != 0) { + ErrMsgTransfer(uv->transport->errmsg, io->errmsg, "transport"); + return rv; + } + uv->transport->data = uv; + + rv = uv_timer_init(uv->loop, &uv->timer); + assert(rv == 0); /* This should never fail */ + uv->timer.data = uv; + + return 0; +} + +/* Periodic timer callback */ +static void uvTickTimerCb(uv_timer_t *timer) +{ + struct uv *uv; + uv = timer->data; + if (uv->tick_cb != NULL) { + uv->tick_cb(uv->io); + } +} + +/* Implementation of raft_io->start. 
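+ *
+ * This starts the receive stream and a repeating timer that invokes
+ * tick_cb every msecs milliseconds. As a sketch (hypothetical caller
+ * values), a user of the interface would do:
+ *
+ *   rv = io->start(io, 100, tickCb, recvCb);
+ *
+ * to get a tick roughly every 100 ms until raft_io->close is called.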
*/
+static int uvStart(struct raft_io *io,
+		   unsigned msecs,
+		   raft_io_tick_cb tick_cb,
+		   raft_io_recv_cb recv_cb)
+{
+	struct uv *uv;
+	int rv;
+	uv = io->impl;
+	uv->state = UV__ACTIVE;
+	uv->tick_cb = tick_cb;
+	uv->recv_cb = recv_cb;
+	rv = UvRecvStart(uv);
+	if (rv != 0) {
+		return rv;
+	}
+	rv = uv_timer_start(&uv->timer, uvTickTimerCb, msecs, msecs);
+	assert(rv == 0);
+	return 0;
+}
+
+void uvMaybeFireCloseCb(struct uv *uv)
+{
+	tracef("uv maybe fire close cb");
+	if (!uv->closing) {
+		return;
+	}
+
+	if (uv->transport->data != NULL) {
+		return;
+	}
+	if (uv->timer.data != NULL) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->append_segments)) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->finalize_reqs)) {
+		return;
+	}
+	if (uv->finalize_work.data != NULL) {
+		return;
+	}
+	if (uv->prepare_inflight != NULL) {
+		return;
+	}
+	if (uv->barrier != NULL) {
+		return;
+	}
+	if (uv->snapshot_put_work.data != NULL) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->snapshot_get_reqs)) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->async_work_reqs)) {
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->aborting)) {
+		return;
+	}
+
+	assert(uv->truncate_work.data == NULL);
+
+	if (uv->close_cb != NULL) {
+		uv->close_cb(uv->io);
+	}
+}
+
+static void uvTickTimerCloseCb(uv_handle_t *handle)
+{
+	struct uv *uv = handle->data;
+	assert(uv->closing);
+	uv->timer.data = NULL;
+	uvMaybeFireCloseCb(uv);
+}
+
+static void uvTransportCloseCb(struct raft_uv_transport *transport)
+{
+	struct uv *uv = transport->data;
+	assert(uv->closing);
+	uv->transport->data = NULL;
+	uvMaybeFireCloseCb(uv);
+}
+
+/* Implementation of raft_io->close. */
+static void uvClose(struct raft_io *io, raft_io_close_cb cb)
+{
+	struct uv *uv;
+	uv = io->impl;
+	assert(uv != NULL);
+	assert(!uv->closing);
+	uv->close_cb = cb;
+	uv->closing = true;
+	UvSendClose(uv);
+	UvRecvClose(uv);
+	uvAppendClose(uv);
+	if (uv->transport->data != NULL) {
+		uv->transport->close(uv->transport, uvTransportCloseCb);
+	}
+	if (uv->timer.data != NULL) {
+		uv_close((uv_handle_t *)&uv->timer, uvTickTimerCloseCb);
+	}
+	uvMaybeFireCloseCb(uv);
+}
+
+/* Filter the given segment list to find the most recent contiguous chunk of
+ * closed segments that overlaps with the given snapshot last index. */
+static int uvFilterSegments(struct uv *uv,
+			    raft_index last_index,
+			    const char *snapshot_filename,
+			    struct uvSegmentInfo **segments,
+			    size_t *n)
+{
+	struct uvSegmentInfo *segment;
+	size_t i; /* First valid closed segment. */
+	size_t j; /* Last valid closed segment. */
+
+	/* If there are no segments at all, or only open segments, there's
+	 * nothing to do. */
+	if (*segments == NULL || (*segments)[0].is_open) {
+		return 0;
+	}
+
+	/* Find the index of the most recent closed segment. */
+	for (j = 0; j < *n; j++) {
+		segment = &(*segments)[j];
+		if (segment->is_open) {
+			break;
+		}
+	}
+	assert(j > 0);
+	j--;
+
+	segment = &(*segments)[j];
+	tracef("most recent closed segment is %s", segment->filename);
+
+	/* If the end index of the last closed segment is lower than the last
+	 * snapshot index, there might be no entry that we can keep. We return
+	 * an empty segment list, unless there is at least one open segment, in
+	 * which case we keep everything hoping that they contain all the entries
+	 * since the last closed segment (TODO: we should encode the starting
+	 * entry in the open segment).
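+	 *
+	 * For example (hypothetical state): with closed segments
+	 * 0000000000000001-0000000000000100 and
+	 * 0000000000000101-0000000000000200 and a snapshot whose last index
+	 * is 250, neither closed segment reaches entry 251, so both are
+	 * discarded unless an open segment is present.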
*/ + if (segment->end_index < last_index) { + if (!(*segments)[*n - 1].is_open) { + tracef( + "discarding all closed segments, since most recent " + "is behind " + "last snapshot"); + raft_free(*segments); + *segments = NULL; + *n = 0; + return 0; + } + tracef( + "most recent closed segment %s is behind last snapshot, " + "yet there are open segments", + segment->filename); + } + + /* Now scan the segments backwards, searching for the longest list of + * contiguous closed segments. */ + if (j >= 1) { + for (i = j; i > 0; i--) { + struct uvSegmentInfo *newer; + struct uvSegmentInfo *older; + newer = &(*segments)[i]; + older = &(*segments)[i - 1]; + if (older->end_index != newer->first_index - 1) { + tracef("discarding non contiguous segment %s", + older->filename); + break; + } + } + } else { + i = j; + } + + /* Make sure that the first index of the first valid closed segment is + * not greater than the snapshot's last index plus one (so there are no + * missing entries). */ + segment = &(*segments)[i]; + if (segment->first_index > last_index + 1) { + ErrMsgPrintf(uv->io->errmsg, + "closed segment %s is past last snapshot %s", + segment->filename, snapshot_filename); + return RAFT_CORRUPT; + } + + if (i != 0) { + size_t new_n = *n - i; + struct uvSegmentInfo *new_segments; + new_segments = raft_malloc(new_n * sizeof *new_segments); + if (new_segments == NULL) { + return RAFT_NOMEM; + } + memcpy(new_segments, &(*segments)[i], + new_n * sizeof *new_segments); + raft_free(*segments); + *segments = new_segments; + *n = new_n; + } + + return 0; +} + +/* Load the last snapshot (if any) and all entries contained in all segment + * files of the data directory. This function can be called recursively, `depth` + * is there to ensure we don't get stuck in a recursive loop. */ +static int uvLoadSnapshotAndEntries(struct uv *uv, + struct raft_snapshot **snapshot, + raft_index *start_index, + struct raft_entry *entries[], + size_t *n, + int depth) +{ + struct uvSnapshotInfo *snapshots; + struct uvSegmentInfo *segments; + size_t n_snapshots; + size_t n_segments; + int rv; + + *snapshot = NULL; + *start_index = 1; + *entries = NULL; + *n = 0; + + /* List available snapshots and segments. */ + rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, + uv->io->errmsg); + if (rv != 0) { + goto err; + } + + /* Load the most recent snapshot, if any. */ + if (snapshots != NULL) { + char snapshot_filename[UV__FILENAME_LEN]; + *snapshot = RaftHeapMalloc(sizeof **snapshot); + if (*snapshot == NULL) { + rv = RAFT_NOMEM; + goto err; + } + rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], *snapshot, + uv->io->errmsg); + if (rv != 0) { + RaftHeapFree(*snapshot); + *snapshot = NULL; + goto err; + } + uvSnapshotFilenameOf(&snapshots[n_snapshots - 1], + snapshot_filename); + tracef("most recent snapshot at %lld", (*snapshot)->index); + RaftHeapFree(snapshots); + snapshots = NULL; + + /* Update the start index. If there are closed segments on disk + * let's make sure that the first index of the first closed + * segment is not greater than the snapshot's last index plus + * one (so there are no missing entries), and update the start + * index accordingly. 
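+		 *
+		 * E.g. (hypothetical indices): with a snapshot whose last
+		 * index is 100, a first closed segment starting at index 95
+		 * is acceptable and start_index becomes 95, while a first
+		 * closed segment starting at index 102 would leave entry 101
+		 * missing and is reported as corruption.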
*/
+		rv = uvFilterSegments(uv, (*snapshot)->index, snapshot_filename,
+				      &segments, &n_segments);
+		if (rv != 0) {
+			goto err;
+		}
+		if (segments != NULL) {
+			if (segments[0].is_open) {
+				*start_index = (*snapshot)->index + 1;
+			} else {
+				*start_index = segments[0].first_index;
+			}
+		} else {
+			*start_index = (*snapshot)->index + 1;
+		}
+	}
+
+	/* Read data from segments, closing any open segments. */
+	if (segments != NULL) {
+		raft_index last_index;
+		rv = uvSegmentLoadAll(uv, *start_index, segments, n_segments,
+				      entries, n);
+		if (rv != 0) {
+			goto err;
+		}
+
+		/* Check if all entries that we loaded are actually behind the
+		 * last snapshot. This can happen if the last closed segment was
+		 * behind the last snapshot and there were open segments, but
+		 * the entries in the open segments turned out to be behind the
+		 * snapshot as well. */
+		last_index = *start_index + *n - 1;
+		if (*snapshot != NULL && last_index < (*snapshot)->index) {
+			ErrMsgPrintf(uv->io->errmsg,
+				     "last entry on disk has index %llu, which "
+				     "is behind "
+				     "last snapshot's index %llu",
+				     last_index, (*snapshot)->index);
+			rv = RAFT_CORRUPT;
+			goto err;
+		}
+
+		raft_free(segments);
+		segments = NULL;
+	}
+
+	return 0;
+
+err:
+	assert(rv != 0);
+	if (*snapshot != NULL) {
+		snapshotDestroy(*snapshot);
+		*snapshot = NULL;
+	}
+	if (snapshots != NULL) {
+		raft_free(snapshots);
+	}
+	if (segments != NULL) {
+		raft_free(segments);
+	}
+	if (*entries != NULL) {
+		entryBatchesDestroy(*entries, *n);
+		*entries = NULL;
+		*n = 0;
+	}
+	/* Try to recover exactly once when corruption is detected; the first
+	 * pass might have cleaned up corrupt data. Most of the arguments are
+	 * already reset after the `err` label, except for `start_index`. */
+	if (rv == RAFT_CORRUPT && uv->auto_recovery && depth == 0) {
+		*start_index = 1;
+		return uvLoadSnapshotAndEntries(uv, snapshot, start_index,
+						entries, n, depth + 1);
+	}
+	return rv;
+}
+
+/* Implementation of raft_io->load. */
+static int uvLoad(struct raft_io *io,
+		  raft_term *term,
+		  raft_id *voted_for,
+		  struct raft_snapshot **snapshot,
+		  raft_index *start_index,
+		  struct raft_entry **entries,
+		  size_t *n_entries)
+{
+	struct uv *uv;
+	int rv;
+	uv = io->impl;
+
+	*term = uv->metadata.term;
+	*voted_for = uv->metadata.voted_for;
+	*snapshot = NULL;
+
+	rv = uvLoadSnapshotAndEntries(uv, snapshot, start_index, entries,
+				      n_entries, 0);
+	if (rv != 0) {
+		return rv;
+	}
+	tracef("start index %lld, %zu entries", *start_index, *n_entries);
+	if (*snapshot == NULL) {
+		tracef("no snapshot");
+	}
+
+	/* Set the index of the next entry that will be appended. */
+	uv->append_next_index = *start_index + *n_entries;
+
+	return 0;
+}
+
+/* Implementation of raft_io->set_term. */
+static int uvSetTerm(struct raft_io *io, const raft_term term)
+{
+	struct uv *uv;
+	int rv;
+	uv = io->impl;
+	uv->metadata.version++;
+	uv->metadata.term = term;
+	uv->metadata.voted_for = 0;
+	rv = uvMetadataStore(uv, &uv->metadata);
+	if (rv != 0) {
+		return rv;
+	}
+	return 0;
+}
+
+/* Implementation of raft_io->set_vote. */
+static int uvSetVote(struct raft_io *io, const raft_id server_id)
+{
+	struct uv *uv;
+	int rv;
+	uv = io->impl;
+	uv->metadata.version++;
+	uv->metadata.voted_for = server_id;
+	rv = uvMetadataStore(uv, &uv->metadata);
+	if (rv != 0) {
+		return rv;
+	}
+	return 0;
+}
+
+/* Implementation of raft_io->bootstrap.
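+ *
+ * This persists term 1 and a first closed segment holding a single
+ * configuration entry. A caller-side sketch (hypothetical values, using
+ * the public raft configuration API):
+ *
+ *   struct raft_configuration conf;
+ *   raft_configuration_init(&conf);
+ *   raft_configuration_add(&conf, 1, "127.0.0.1:9001", RAFT_VOTER);
+ *   rv = io->bootstrap(io, &conf);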
*/ +static int uvBootstrap(struct raft_io *io, + const struct raft_configuration *configuration) +{ + struct uv *uv; + int rv; + uv = io->impl; + + /* We shouldn't have written anything else yet. */ + if (uv->metadata.term != 0) { + ErrMsgPrintf(io->errmsg, "metadata contains term %lld", + uv->metadata.term); + return RAFT_CANTBOOTSTRAP; + } + + /* Write the term */ + rv = uvSetTerm(io, 1); + if (rv != 0) { + return rv; + } + + /* Create the first closed segment file, containing just one entry. */ + rv = uvSegmentCreateFirstClosed(uv, configuration); + if (rv != 0) { + return rv; + } + + return 0; +} + +/* Implementation of raft_io->recover. */ +static int uvRecover(struct raft_io *io, const struct raft_configuration *conf) +{ + struct uv *uv = io->impl; + struct raft_snapshot *snapshot; + raft_index start_index; + raft_index next_index; + struct raft_entry *entries; + size_t n_entries; + int rv; + + /* Load the current state. This also closes any leftover open segment. + */ + rv = uvLoadSnapshotAndEntries(uv, &snapshot, &start_index, &entries, + &n_entries, 0); + if (rv != 0) { + return rv; + } + + /* We don't care about the actual data, just index of the last entry. */ + if (snapshot != NULL) { + snapshotDestroy(snapshot); + } + if (entries != NULL) { + entryBatchesDestroy(entries, n_entries); + } + + assert(start_index > 0); + next_index = start_index + n_entries; + + rv = uvSegmentCreateClosedWithConfiguration(uv, next_index, conf); + if (rv != 0) { + return rv; + } + + return 0; +} + +/* Implementation of raft_io->time. */ +static raft_time uvTime(struct raft_io *io) +{ + struct uv *uv; + uv = io->impl; + return uv_now(uv->loop); +} + +/* Implementation of raft_io->random. */ +static int uvRandom(struct raft_io *io, int min, int max) +{ + (void)io; + return min + (abs(rand()) % (max - min)); +} + +static void uvSeedRand(struct uv *uv) +{ + ssize_t sz = -1; + unsigned seed = 0; /* fed to srand() */ + + sz = getrandom(&seed, sizeof seed, GRND_NONBLOCK); + if (sz == -1 || sz < ((ssize_t)sizeof seed)) { + /* Fall back to an inferior random seed when `getrandom` would + * have blocked or when not enough randomness was returned. */ + seed ^= (unsigned)uv->id; + seed ^= (unsigned)uv_now(uv->loop); + struct timeval time = {0}; + /* Ignore errors. */ + gettimeofday(&time, NULL); + seed ^= + (unsigned)((time.tv_sec * 1000) + (time.tv_usec / 1000)); + } + + srand(seed); +} + +int raft_uv_init(struct raft_io *io, + struct uv_loop_s *loop, + const char *dir, + struct raft_uv_transport *transport) +{ + struct uv *uv; + void *data; + int rv; + + assert(io != NULL); + assert(loop != NULL); + assert(dir != NULL); + assert(transport != NULL); + + data = io->data; + memset(io, 0, sizeof *io); + io->data = data; + + if (transport->version == 0) { + ErrMsgPrintf(io->errmsg, "transport->version must be set"); + return RAFT_INVALID; + } + + /* Ensure that the given path doesn't exceed our static buffer limit. 
*/ + if (!UV__DIR_HAS_VALID_LEN(dir)) { + ErrMsgPrintf(io->errmsg, "directory path too long"); + return RAFT_NAMETOOLONG; + } + + /* Allocate the raft_io_uv object */ + uv = raft_malloc(sizeof *uv); + if (uv == NULL) { + rv = RAFT_NOMEM; + goto err; + } + memset(uv, 0, sizeof(struct uv)); + + uv->io = io; + uv->loop = loop; + strncpy(uv->dir, dir, sizeof(uv->dir) - 1); + uv->dir[sizeof(uv->dir) - 1] = '\0'; + uv->transport = transport; + uv->transport->data = NULL; + uv->tracer = NULL; + uv->id = 0; /* Set by raft_io->config() */ + uv->state = UV__PRISTINE; + uv->errored = false; + uv->direct_io = false; + uv->async_io = false; + uv->fallocate = false; +#ifdef LZ4_ENABLED + uv->snapshot_compression = true; +#else + uv->snapshot_compression = false; +#endif + uv->segment_size = UV__MAX_SEGMENT_SIZE; + uv->block_size = 0; + QUEUE_INIT(&uv->clients); + QUEUE_INIT(&uv->servers); + uv->connect_retry_delay = CONNECT_RETRY_DELAY; + uv->prepare_inflight = NULL; + QUEUE_INIT(&uv->prepare_reqs); + QUEUE_INIT(&uv->prepare_pool); + uv->prepare_next_counter = 1; + uv->append_next_index = 1; + QUEUE_INIT(&uv->append_segments); + QUEUE_INIT(&uv->append_pending_reqs); + QUEUE_INIT(&uv->append_writing_reqs); + uv->barrier = NULL; + QUEUE_INIT(&uv->finalize_reqs); + uv->finalize_work.data = NULL; + uv->truncate_work.data = NULL; + QUEUE_INIT(&uv->snapshot_get_reqs); + QUEUE_INIT(&uv->async_work_reqs); + uv->snapshot_put_work.data = NULL; + uv->timer.data = NULL; + uv->tick_cb = NULL; /* Set by raft_io->start() */ + uv->recv_cb = NULL; /* Set by raft_io->start() */ + QUEUE_INIT(&uv->aborting); + uv->closing = false; + uv->close_cb = NULL; + uv->auto_recovery = true; + + uvSeedRand(uv); + + /* Set the raft_io implementation. */ + io->version = 2; /* future-proof'ing */ + io->impl = uv; + io->init = uvInit; + io->close = uvClose; + io->start = uvStart; + io->load = uvLoad; + io->bootstrap = uvBootstrap; + io->recover = uvRecover; + io->set_term = uvSetTerm; + io->set_vote = uvSetVote; + io->append = UvAppend; + io->truncate = UvTruncate; + io->send = UvSend; + io->snapshot_put = UvSnapshotPut; + io->snapshot_get = UvSnapshotGet; + io->async_work = UvAsyncWork; + io->time = uvTime; + io->random = uvRandom; + + return 0; + +err: + assert(rv != 0); + if (rv == RAFT_NOMEM) { + ErrMsgOom(io->errmsg); + } + return rv; +} + +void raft_uv_close(struct raft_io *io) +{ + struct uv *uv; + uv = io->impl; + io->impl = NULL; + raft_free(uv); +} + +void raft_uv_set_segment_size(struct raft_io *io, size_t size) +{ + struct uv *uv; + uv = io->impl; + uv->segment_size = size; +} + +void raft_uv_set_block_size(struct raft_io *io, size_t size) +{ + struct uv *uv; + uv = io->impl; + uv->block_size = size; +} + +int raft_uv_set_snapshot_compression(struct raft_io *io, bool compressed) +{ + struct uv *uv; + uv = io->impl; +#ifndef LZ4_AVAILABLE + if (compressed) { + return RAFT_INVALID; + } +#endif + uv->snapshot_compression = compressed; + return 0; +} + +void raft_uv_set_connect_retry_delay(struct raft_io *io, unsigned msecs) +{ + struct uv *uv; + uv = io->impl; + uv->connect_retry_delay = msecs; +} + +void raft_uv_set_tracer(struct raft_io *io, struct raft_tracer *tracer) +{ + struct uv *uv; + uv = io->impl; + uv->tracer = tracer; +} + +void raft_uv_set_auto_recovery(struct raft_io *io, bool flag) +{ + struct uv *uv; + uv = io->impl; + uv->auto_recovery = flag; +} + +#undef tracef diff --git a/src/raft/uv.h b/src/raft/uv.h new file mode 100644 index 000000000..db4009c64 --- /dev/null +++ b/src/raft/uv.h @@ -0,0 +1,422 @@ +/* 
Implementation of the @raft_io interface based on libuv. */ + +#ifndef UV_H_ +#define UV_H_ + +#include "../raft.h" +#include "../tracing.h" +#include "err.h" +#include "queue.h" +#include "uv_fs.h" +#include "uv_os.h" + +/* 8 Megabytes */ +#define UV__MAX_SEGMENT_SIZE (8 * 1024 * 1024) + +/* Template string for closed segment filenames: start index (inclusive), end + * index (inclusive). */ +#define UV__CLOSED_TEMPLATE "%016llu-%016llu" + +/* Template string for open segment filenames: incrementing counter. */ +#define UV__OPEN_TEMPLATE "open-%llu" + +/* Enough to hold a segment filename (either open or closed) */ +#define UV__SEGMENT_FILENAME_BUF_SIZE 34 + +/* Template string for snapshot filenames: snapshot term, snapshot index, + * creation timestamp (milliseconds since epoch). */ +#define UV__SNAPSHOT_TEMPLATE "snapshot-%llu-%llu-%llu" + +#define UV__SNAPSHOT_META_SUFFIX ".meta" + +/* Template string for snapshot metadata filenames: snapshot term, snapshot + * index, creation timestamp (milliseconds since epoch). */ +#define UV__SNAPSHOT_META_TEMPLATE \ + UV__SNAPSHOT_TEMPLATE UV__SNAPSHOT_META_SUFFIX + +/* State codes. */ +enum { + UV__PRISTINE, /* Metadata cache populated and I/O capabilities probed */ + UV__ACTIVE, + UV__CLOSED +}; + +/* Open segment counter type */ +typedef unsigned long long uvCounter; + +/* Information persisted in a single metadata file. */ +struct uvMetadata +{ + unsigned long long version; /* Monotonically increasing version */ + raft_term term; /* Current term */ + raft_id voted_for; /* Server ID of last vote, or 0 */ +}; + +/* Hold state of a libuv-based raft_io implementation. */ +struct uv +{ + struct raft_io *io; /* I/O object we're implementing */ + struct uv_loop_s *loop; /* UV event loop */ + char dir[UV__DIR_LEN]; /* Data directory */ + struct raft_uv_transport *transport; /* Network transport */ + struct raft_tracer *tracer; /* Debug tracing */ + raft_id id; /* Server ID */ + int state; /* Current state */ + bool snapshot_compression; /* If compression is enabled */ + bool errored; /* If a disk I/O error was hit */ + bool direct_io; /* Whether direct I/O is supported */ + bool async_io; /* Whether async I/O is supported */ + bool fallocate; /* Whether fallocate is supported */ + size_t segment_size; /* Initial size of open segments. */ + size_t block_size; /* Block size of the data dir */ + queue clients; /* Outbound connections */ + queue servers; /* Inbound connections */ + unsigned connect_retry_delay; /* Client connection retry delay */ + void *prepare_inflight; /* Segment being prepared */ + queue prepare_reqs; /* Pending prepare requests. */ + queue prepare_pool; /* Prepared open segments */ + uvCounter prepare_next_counter; /* Counter of next open segment */ + raft_index append_next_index; /* Index of next entry to append */ + queue append_segments; /* Open segments in use. */ + queue append_pending_reqs; /* Pending append requests. 
*/
+	queue append_writing_reqs;          /* Append requests in flight */
+	struct UvBarrier *barrier;          /* Inflight barrier request */
+	queue finalize_reqs;                /* Segments waiting to be closed */
+	struct uv_work_s finalize_work;     /* Resize and rename segments */
+	struct uv_work_s truncate_work;     /* Execute truncate log requests */
+	queue snapshot_get_reqs;            /* Inflight get snapshot requests */
+	queue async_work_reqs;              /* Inflight async work requests */
+	struct uv_work_s snapshot_put_work; /* Execute snapshot put requests */
+	struct uvMetadata metadata;         /* Cache of metadata on disk */
+	struct uv_timer_s timer;            /* Timer for periodic ticks */
+	raft_io_tick_cb tick_cb;            /* Invoked when the timer expires */
+	raft_io_recv_cb recv_cb;            /* Invoked when RPC messages arrive */
+	queue aborting;                     /* Cleanups upon errors or shutdown */
+	bool closing;                       /* True if we are closing */
+	raft_io_close_cb close_cb;          /* Invoked when finishing closing */
+	bool auto_recovery; /* Try to recover from corrupt segments */
+};
+
+/* Implementation of raft_io->truncate. */
+int UvTruncate(struct raft_io *io, raft_index index);
+
+/* Load Raft metadata from disk, choosing the most recent version (either the
+ * metadata1 or metadata2 file). */
+int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg);
+
+/* Store the given metadata to disk, writing the appropriate metadata file
+ * according to the metadata version (if the version is odd, write metadata1,
+ * otherwise write metadata2). */
+int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata);
+
+/* Metadata about a segment file. */
+struct uvSegmentInfo
+{
+	bool is_open; /* Whether the segment is open */
+	union {
+		struct
+		{
+			raft_index
+			    first_index; /* First index in a closed segment */
+			raft_index
+			    end_index; /* Last index in a closed segment */
+		};
+		struct
+		{
+			unsigned long long counter; /* Open segment counter */
+		};
+	};
+	char filename[UV__SEGMENT_FILENAME_BUF_SIZE]; /* Segment filename */
+};
+
+/* Append a new item to the given segment info list if the given filename
+ * matches either the one of a closed segment (xxx-yyy) or the one of an open
+ * segment (open-xxx). */
+int uvSegmentInfoAppendIfMatch(const char *filename,
+			       struct uvSegmentInfo *infos[],
+			       size_t *n_infos,
+			       bool *appended);
+
+/* Sort the given list of segments by comparing their filenames. Closed segments
+ * come before open segments. */
+void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos);
+
+/* Keep only the closed segments whose entries are within the given trailing
+ * amount past the given snapshot last index. If the given trailing amount is 0,
+ * unconditionally delete all closed segments. */
+int uvSegmentKeepTrailing(struct uv *uv,
+			  struct uvSegmentInfo *segments,
+			  size_t n,
+			  raft_index last_index,
+			  size_t trailing,
+			  char *errmsg);
+
+/* Load all entries contained in the given closed segment. */
+int uvSegmentLoadClosed(struct uv *uv,
+			struct uvSegmentInfo *segment,
+			struct raft_entry *entries[],
+			size_t *n);
+
+/* Load raft entries from the given segments. The @start_index is the expected
+ * index of the first entry of the first segment. */
+int uvSegmentLoadAll(struct uv *uv,
+		     const raft_index start_index,
+		     struct uvSegmentInfo *segments,
+		     size_t n_segments,
+		     struct raft_entry **entries,
+		     size_t *n_entries);
+
+/* Return the number of blocks in a segment. */
+#define uvSegmentBlocks(UV) (UV->segment_size / UV->block_size)
+
+/* A dynamically allocated buffer holding data to be written into a segment
+ * file.
+ * + * The memory is aligned at disk block boundary, to allow for direct I/O. */ +struct uvSegmentBuffer +{ + size_t block_size; /* Disk block size for direct I/O */ + uv_buf_t arena; /* Previously allocated memory that can be re-used */ + size_t n; /* Write offset */ +}; + +/* Initialize an empty buffer. */ +void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size); + +/* Release all memory used by the buffer. */ +void uvSegmentBufferClose(struct uvSegmentBuffer *b); + +/* Encode the format version at the very beginning of the buffer. This function + * must be called when the buffer is empty. */ +int uvSegmentBufferFormat(struct uvSegmentBuffer *b); + +/* Extend the segment's buffer by encoding the given entries. + * + * Previous data in the buffer will be retained, and data for these new entries + * will be appended. */ +int uvSegmentBufferAppend(struct uvSegmentBuffer *b, + const struct raft_entry entries[], + unsigned n_entries); + +/* After all entries to write have been encoded, finalize the buffer by zeroing + * the unused memory of the last block. The out parameter will point to the + * memory to write. */ +void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out); + +/* Reset the buffer preparing it for the next segment write. + * + * If the retain parameter is greater than zero, then the data of the retain'th + * block will be copied at the beginning of the buffer and the write offset will + * be set accordingly. */ +void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain); + +/* Write a closed segment, containing just one entry at the given index + * for the given configuration. */ +int uvSegmentCreateClosedWithConfiguration( + struct uv *uv, + raft_index index, + const struct raft_configuration *configuration); + +/* Write the first closed segment, containing just one entry for the given + * configuration. */ +int uvSegmentCreateFirstClosed(struct uv *uv, + const struct raft_configuration *configuration); + +/* Truncate a segment that was already closed. */ +int uvSegmentTruncate(struct uv *uv, + struct uvSegmentInfo *segment, + raft_index index); + +/* Info about a persisted snapshot stored in snapshot metadata file. */ +struct uvSnapshotInfo +{ + raft_term term; + raft_index index; + unsigned long long timestamp; + char filename[UV__FILENAME_LEN]; +}; + +/* Render the filename of the data file of a snapshot */ +void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename); + +/* Upon success `orphan` will be true if filename is a snapshot file without a + * sibling .meta file */ +int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan); + +/* Upon success `orphan` will be true if filename is a snapshot .meta file + * without a sibling snapshot file */ +int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan); + +/* Append a new item to the given snapshot info list if the given filename + * matches the pattern of a snapshot metadata file (snapshot-xxx-yyy-zzz.meta) + * and there is actually a matching non-empty snapshot file on disk. */ +int UvSnapshotInfoAppendIfMatch(struct uv *uv, + const char *filename, + struct uvSnapshotInfo *infos[], + size_t *n_infos, + bool *appended); + +/* Sort the given list of snapshots by comparing their filenames. Older + * snapshots will come first. */ +void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos); + +/* Load the snapshot associated with the given metadata. 
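+ *
+ * For instance (hypothetical filenames), metadata parsed from
+ * snapshot-3-15360-1690000000000.meta is used to load the matching data
+ * file snapshot-3-15360-1690000000000, following the
+ * UV__SNAPSHOT_META_TEMPLATE pattern above.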
*/
+int UvSnapshotLoad(struct uv *uv,
+		   struct uvSnapshotInfo *meta,
+		   struct raft_snapshot *snapshot,
+		   char *errmsg);
+
+/* Implementation of raft_io->snapshot_put (defined in uv_snapshot.c). */
+int UvSnapshotPut(struct raft_io *io,
+		  unsigned trailing,
+		  struct raft_io_snapshot_put *req,
+		  const struct raft_snapshot *snapshot,
+		  raft_io_snapshot_put_cb cb);
+
+/* Implementation of raft_io->snapshot_get (defined in uv_snapshot.c). */
+int UvSnapshotGet(struct raft_io *io,
+		  struct raft_io_snapshot_get *req,
+		  raft_io_snapshot_get_cb cb);
+
+/* Implementation of raft_io->async_work (defined in uv_work.c). */
+int UvAsyncWork(struct raft_io *io,
+		struct raft_io_async_work *req,
+		raft_io_async_work_cb cb);
+
+/* Return a list of all snapshots and segments found in the data directory. Both
+ * snapshots and segments are ordered by filename (closed segments come before
+ * open ones). */
+int UvList(struct uv *uv,
+	   struct uvSnapshotInfo *snapshots[],
+	   size_t *n_snapshots,
+	   struct uvSegmentInfo *segments[],
+	   size_t *n_segments,
+	   char *errmsg);
+
+/* Request to obtain a newly prepared open segment. */
+struct uvPrepare;
+typedef void (*uvPrepareCb)(struct uvPrepare *req, int status);
+struct uvPrepare
+{
+	void *data;                 /* User data */
+	uv_file fd;                 /* Resulting segment file descriptor */
+	unsigned long long counter; /* Resulting segment counter */
+	uvPrepareCb cb;             /* Completion callback */
+	queue queue;                /* Links in uv_io->prepare_reqs */
+};
+
+/* Get a prepared open segment ready for writing. If a prepared open segment is
+ * already available in the pool, it will be returned immediately using the fd
+ * and counter pointers and the request callback won't be invoked. Otherwise the
+ * request will be queued and its callback invoked once a newly prepared segment
+ * is available. */
+int UvPrepare(struct uv *uv,
+	      uv_file *fd,
+	      uvCounter *counter,
+	      struct uvPrepare *req,
+	      uvPrepareCb cb);
+
+/* Cancel all pending prepare requests and start removing all unused prepared
+ * open segments. If a segment is currently being created, wait for it to
+ * complete and then remove it immediately. */
+void UvPrepareClose(struct uv *uv);
+
+/* Implementation of raft_io->append. All the raft_buffers of the raft_entry
+ * structs in the entries array are required to have a len that is a multiple
+ * of 8. */
+int UvAppend(struct raft_io *io,
+	     struct raft_io_append *req,
+	     const struct raft_entry entries[],
+	     unsigned n,
+	     raft_io_append_cb cb);
+
+/* Barrier request object and callback. */
+struct UvBarrierReq;
+
+/* A barrier cb that plans to perform work on the threadpool MUST exit early
+ * and clean up resources when it detects uv->closing; this is to allow forced
+ * closing on shutdown. */
+typedef void (*UvBarrierCb)(struct UvBarrierReq *req);
+struct UvBarrierReq
+{
+	bool blocking;  /* Whether this barrier should block future writes */
+	void *data;     /* User data */
+	UvBarrierCb cb; /* Completion callback */
+	queue queue;    /* Queue of reqs triggered by a UvBarrier */
+};
+
+struct UvBarrier
+{
+	bool blocking; /* Whether this barrier should block future writes */
+	queue reqs;    /* Queue of UvBarrierReq */
+};
+
+/* Submit a barrier request to interrupt the normal flow of append
+ * operations.
+ *
+ * The following will happen:
+ *
+ * - Replace uv->append_next_index with the given next_index, so the next entry
+ *   that will be appended will have the new index.
+ *
+ * - Execution of new writes for subsequent append requests will be blocked
+ *   until UvUnblock is called when the barrier is blocking.
+ *
+ * - Wait for all currently pending and inflight append requests against all
+ *   open segments to complete, and for those open segments to be finalized,
+ *   then invoke the barrier callback.
+ *
+ * This API is used to implement truncate and snapshot install operations, which
+ * need to wait until all pending writes have settled and modify the log state,
+ * changing the next index. */
+int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req);
+
+/* Trigger a callback for a barrier request in this @barrier. Returns true if a
+ * callback was triggered, false if there are no more requests to trigger.
+ * A barrier callback will call UvUnblock, which in turn will try to run the
+ * next callback, if any, from a barrier request in this barrier. */
+bool UvBarrierMaybeTrigger(struct UvBarrier *barrier);
+
+/* Add a barrier @req to an existing @barrier. */
+void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req);
+
+/* Returns @true if there are no more segments referencing uv->barrier */
+bool UvBarrierReady(struct uv *uv);
+
+/* Resume writing append requests after UvBarrier has been called. */
+void UvUnblock(struct uv *uv);
+
+/* Cancel all pending write requests and request the current segment to be
+ * finalized. Must be invoked at closing time. */
+void uvAppendClose(struct uv *uv);
+
+/* Submit a request to finalize the open segment with the given counter.
+ *
+ * Requests are processed one at a time, to avoid ending up closing open segment
+ * N + 1 before closing open segment N. */
+int UvFinalize(struct uv *uv,
+	       unsigned long long counter,
+	       size_t used,
+	       raft_index first_index,
+	       raft_index last_index);
+
+/* Implementation of raft_io->send. */
+int UvSend(struct raft_io *io,
+	   struct raft_io_send *req,
+	   const struct raft_message *message,
+	   raft_io_send_cb cb);
+
+/* Stop all clients by closing the outbound stream handles and canceling all
+ * pending send requests. */
+void UvSendClose(struct uv *uv);
+
+/* Start receiving messages from new incoming connections. */
+int UvRecvStart(struct uv *uv);
+
+/* Stop all servers by closing the inbound stream handles and aborting all
+ * requests being received. */
+void UvRecvClose(struct uv *uv);
+
+void uvMaybeFireCloseCb(struct uv *uv);
+
+#endif /* UV_H_ */
diff --git a/src/raft/uv_append.c b/src/raft/uv_append.c
new file mode 100644
index 000000000..9feb66ca5
--- /dev/null
+++ b/src/raft/uv_append.c
@@ -0,0 +1,1034 @@
+#include "assert.h"
+#include "byte.h"
+#include "heap.h"
+#include "queue.h"
+#include "uv.h"
+#include "uv_encoding.h"
+#include "uv_writer.h"
+
+/* The happy path for an append request is:
+ *
+ * - If there is a current segment and it has enough spare capacity to hold
+ *   the entries in the request, then queue the request, linking it to the
+ *   current segment.
+ *
+ * - If there is no current segment, or it hasn't enough spare capacity to hold
+ *   the entries in the request, then request a new open segment to be prepared,
+ *   queue the request and link it to the newly requested segment.
+ *
+ * - Wait for any pending write against the current segment to complete, and
+ *   also for the prepare request if we asked for a new segment. Also wait for
+ *   any in progress barrier to be removed.
+ *
+ * - Submit a write request for the entries in this append request.
The write + * request might contain other append requests targeted to the current segment + * that might have accumulated in the meantime, if we have been waiting for a + * segment to be prepared, or for the previous write to complete or for a + * barrier to be removed. + * + * - Wait for the write request to finish and fire the append request's + * callback. + * + * Possible failure modes are: + * + * - The request to prepare a new segment fails. + * - The write request fails. + * - The request to finalize a new segment fails to be submitted. + * + * In all these cases we mark the instance as errored and fire the relevant + * callbacks. + **/ + +/* An open segment being written or waiting to be written. */ +struct uvAliveSegment +{ + struct uv *uv; /* Our writer */ + struct uvPrepare prepare; /* Prepare segment file request */ + struct UvWriter writer; /* Writer to perform async I/O */ + struct UvWriterReq write; /* Write request */ + unsigned long long counter; /* Open segment counter */ + raft_index first_index; /* Index of the first entry written */ + raft_index pending_last_index; /* Index of the last entry written */ + size_t size; /* Total number of bytes used */ + unsigned next_block; /* Next segment block to write */ + struct uvSegmentBuffer pending; /* Buffer for data yet to be written */ + uv_buf_t buf; /* Write buffer for current write */ + raft_index last_index; /* Last entry actually written */ + size_t written; /* Number of bytes actually written */ + queue queue; /* Segment queue */ + struct UvBarrier *barrier; /* Barrier waiting on this segment */ + bool finalize; /* Finalize the segment after writing */ +}; + +struct uvAppend +{ + struct raft_io_append *req; /* User request */ + const struct raft_entry *entries; /* Entries to write */ + unsigned n; /* Number of entries */ + struct uvAliveSegment *segment; /* Segment to write to */ + queue queue; +}; + +static void uvAliveSegmentWriterCloseCb(struct UvWriter *writer) +{ + struct uvAliveSegment *segment = writer->data; + struct uv *uv = segment->uv; + uvSegmentBufferClose(&segment->pending); + RaftHeapFree(segment); + uvMaybeFireCloseCb(uv); +} + +/* Submit a request to close the current open segment. */ +static void uvAliveSegmentFinalize(struct uvAliveSegment *s) +{ + struct uv *uv = s->uv; + int rv; + + rv = UvFinalize(uv, s->counter, s->written, s->first_index, + s->last_index); + if (rv != 0) { + uv->errored = true; + /* We failed to submit the finalize request, but let's still + * close the file handle and release the segment memory. */ + } + + QUEUE_REMOVE(&s->queue); + UvWriterClose(&s->writer, uvAliveSegmentWriterCloseCb); +} + +/* Flush the append requests in the given queue, firing their callbacks with the + * given status. */ +static void uvAppendFinishRequestsInQueue(struct uv *uv, queue *q, int status) +{ + queue queue_copy; + struct uvAppend *append; + QUEUE_INIT(&queue_copy); + while (!QUEUE_IS_EMPTY(q)) { + queue *head; + head = QUEUE_HEAD(q); + append = QUEUE_DATA(head, struct uvAppend, queue); + /* Rollback the append next index if the result was + * unsuccessful. 
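+		 *
+		 * For example (hypothetical indices): if append_next_index
+		 * was 104 after queuing a 3-entry batch starting at index
+		 * 101, a failed write rewinds it to 101, keeping the index
+		 * accounting consistent with what actually reached disk.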
*/ + if (status != 0) { + tracef("rollback uv->append_next_index was:%llu", + uv->append_next_index); + uv->append_next_index -= append->n; + tracef("rollback uv->append_next_index now:%llu", + uv->append_next_index); + } + QUEUE_REMOVE(head); + QUEUE_PUSH(&queue_copy, head); + } + while (!QUEUE_IS_EMPTY(&queue_copy)) { + queue *head; + struct raft_io_append *req; + head = QUEUE_HEAD(&queue_copy); + append = QUEUE_DATA(head, struct uvAppend, queue); + QUEUE_REMOVE(head); + req = append->req; + RaftHeapFree(append); + req->cb(req, status); + } +} + +/* Flush the append requests in the writing queue, firing their callbacks with + * the given status. */ +static void uvAppendFinishWritingRequests(struct uv *uv, int status) +{ + uvAppendFinishRequestsInQueue(uv, &uv->append_writing_reqs, status); +} + +/* Flush the append requests in the pending queue, firing their callbacks with + * the given status. */ +static void uvAppendFinishPendingRequests(struct uv *uv, int status) +{ + uvAppendFinishRequestsInQueue(uv, &uv->append_pending_reqs, status); +} + +/* Return the segment currently being written, or NULL when no segment has been + * written yet. */ +static struct uvAliveSegment *uvGetCurrentAliveSegment(struct uv *uv) +{ + queue *head; + if (QUEUE_IS_EMPTY(&uv->append_segments)) { + return NULL; + } + head = QUEUE_HEAD(&uv->append_segments); + return QUEUE_DATA(head, struct uvAliveSegment, queue); +} + +/* Extend the segment's write buffer by encoding the entries in the given + * request into it. IOW, previous data in the write buffer will be retained, and + * data for these new entries will be appended. */ +static int uvAliveSegmentEncodeEntriesToWriteBuf(struct uvAliveSegment *segment, + struct uvAppend *append) +{ + int rv; + assert(append->segment == segment); + + /* If this is the very first write to the segment, we need to include + * the format version */ + if (segment->pending.n == 0 && segment->next_block == 0) { + rv = uvSegmentBufferFormat(&segment->pending); + if (rv != 0) { + return rv; + } + } + + rv = uvSegmentBufferAppend(&segment->pending, append->entries, + append->n); + if (rv != 0) { + return rv; + } + + segment->pending_last_index += append->n; + + return 0; +} + +static int uvAppendMaybeStart(struct uv *uv); +static void uvAliveSegmentWriteCb(struct UvWriterReq *write, const int status) +{ + struct uvAliveSegment *s = write->data; + struct uv *uv = s->uv; + unsigned n_blocks; + int rv; + + assert(uv->state != UV__CLOSED); + + assert(s->buf.len % uv->block_size == 0); + assert(s->buf.len >= uv->block_size); + + /* Check if the write was successful. */ + if (status != 0) { + tracef("write: %s", uv->io->errmsg); + uv->errored = true; + goto out; + } + + s->written = s->next_block * uv->block_size + s->pending.n; + s->last_index = s->pending_last_index; + + /* Update our write markers. + * + * We have four cases: + * + * - The data fit completely in the leftover space of the first block + * that we wrote and there is more space left. In this case we just keep + * the scheduled marker unchanged. + * + * - The data fit completely in the leftover space of the first block + * that we wrote and there is no space left. In this case we advance the + * current block counter, reset the first write block and set the + * scheduled marker to 0. + * + * - The data did not fit completely in the leftover space of the first + * block that we wrote, so we wrote more than one block. The last + * block that we wrote was not filled completely and has leftover space. 
+	 *   In this case we advance the current block counter and copy the memory
+	 *   used for the last block to the head of the write arena list, updating
+	 *   the scheduled marker accordingly.
+	 *
+	 * - The data did not fit completely in the leftover space of the first
+	 *   block that we wrote, so we wrote more than one block. The last
+	 *   block that we wrote was filled exactly and has no leftover space. In
+	 *   this case we advance the current block counter, reset the first
+	 *   buffer and set the scheduled marker to 0.
+	 */
+	n_blocks = (unsigned)(s->buf.len /
+			      uv->block_size); /* Number of blocks written. */
+	if (s->pending.n < uv->block_size) {
+		/* Nothing to do */
+		assert(n_blocks == 1);
+	} else if (s->pending.n == uv->block_size) {
+		assert(n_blocks == 1);
+		s->next_block++;
+		uvSegmentBufferReset(&s->pending, 0);
+	} else {
+		assert(s->pending.n > uv->block_size);
+		assert(s->buf.len > uv->block_size);
+
+		if (s->pending.n % uv->block_size > 0) {
+			s->next_block += n_blocks - 1;
+			uvSegmentBufferReset(&s->pending, n_blocks - 1);
+		} else {
+			s->next_block += n_blocks;
+			uvSegmentBufferReset(&s->pending, 0);
+		}
+	}
+
+out:
+	/* Fire the callbacks of all requests that were fulfilled with this
+	 * write. */
+	uvAppendFinishWritingRequests(uv, status);
+	if (status != 0) {
+		/* When the write has failed additionally cancel all future
+		 * append related activity. This will also rewind
+		 * uv->append_next_index. All append requests need to be
+		 * canceled because raft assumes all appends happen in order and
+		 * if an append fails (and is not retried), we would be missing
+		 * a sequence of log entries on disk. The implementation can't
+		 * handle that + the accounting of the append index would be
+		 * off.
+		 */
+		uvAppendFinishPendingRequests(uv, status);
+		/* Allow this segment to be finalized further down. Don't bother
+		 * rewinding state to possibly reuse the segment for writing;
+		 * it's too bug-prone. */
+		s->pending_last_index = s->last_index;
+		s->finalize = true;
+	}
+
+	/* During the closing sequence we should have already canceled all
+	 * pending requests. */
+	if (uv->closing) {
+		assert(QUEUE_IS_EMPTY(&uv->append_pending_reqs));
+		assert(s->finalize);
+		uvAliveSegmentFinalize(s);
+		return;
+	}
+
+	/* Possibly process waiting requests. */
+	if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
+		rv = uvAppendMaybeStart(uv);
+		if (rv != 0) {
+			uv->errored = true;
+		}
+	} else if (s->finalize && (s->pending_last_index == s->last_index) &&
+		   !s->writer.closing) {
+		/* If there are no more append_pending_reqs or write requests in
+		 * flight, this segment must be finalized here in case we don't
+		 * receive AppendEntries RPCs anymore (could happen during a
+		 * Snapshot install, causing the BarrierCb to never fire), but
+		 * check that the callbacks that fired after completion of this
+		 * write didn't already close the segment. */
+		uvAliveSegmentFinalize(s);
+	}
+}
+
+/* Submit a file write request to append the entries encoded in the write buffer
+ * of the given segment.
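+ *
+ * The write is submitted at byte offset next_block * block_size, so e.g.
+ * (hypothetical numbers) with a 4096-byte block size and next_block == 2
+ * the buffer lands at offset 8192, keeping every submission block-aligned
+ * for direct I/O.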
*/
+static int uvAliveSegmentWrite(struct uvAliveSegment *s)
+{
+	int rv;
+	assert(s->counter != 0);
+	assert(s->pending.n > 0);
+	uvSegmentBufferFinalize(&s->pending, &s->buf);
+	rv = UvWriterSubmit(&s->writer, &s->write, &s->buf, 1,
+			    s->next_block * s->uv->block_size,
+			    uvAliveSegmentWriteCb);
+	if (rv != 0) {
+		return rv;
+	}
+	return 0;
+}
+
+/* Start writing all pending append requests for the current segment, unless we
+ * are already writing, or the segment itself has not yet been prepared or we
+ * are blocked on a barrier. If there are no more requests targeted at the
+ * current segment, make sure it's marked to be finalized and try with the next
+ * segment. */
+static int uvAppendMaybeStart(struct uv *uv)
+{
+	struct uvAliveSegment *segment;
+	struct uvAppend *append;
+	unsigned n_reqs;
+	queue *head;
+	queue q;
+	int rv;
+
+	assert(!uv->closing);
+	assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs));
+
+	/* If we are already writing, let's wait. */
+	if (!QUEUE_IS_EMPTY(&uv->append_writing_reqs)) {
+		return 0;
+	}
+
+start:
+	segment = uvGetCurrentAliveSegment(uv);
+	assert(segment != NULL);
+	/* If the preparer isn't done yet, let's wait. */
+	if (segment->counter == 0) {
+		return 0;
+	}
+
+	/* If there's a blocking barrier in progress, and it's not waiting for
+	 * this segment to be finalized, let's wait.
+	 *
+	 * FIXME shouldn't we wait even if segment->barrier == uv->barrier, if
+	 * there are other open segments associated with the same barrier? */
+	if (uv->barrier != NULL && segment->barrier != uv->barrier &&
+	    uv->barrier->blocking) {
+		return 0;
+	}
+
+	/* If there's no barrier in progress and this segment is marked with a
+	 * barrier, it means that this was a pending barrier, which can become
+	 * the current barrier now. */
+	if (uv->barrier == NULL && segment->barrier != NULL) {
+		uv->barrier = segment->barrier;
+	}
+
+	/* Let's add to the segment's write buffer all pending requests targeted
+	 * to this segment. */
+	QUEUE_INIT(&q);
+
+	n_reqs = 0;
+	while (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
+		head = QUEUE_HEAD(&uv->append_pending_reqs);
+		append = QUEUE_DATA(head, struct uvAppend, queue);
+		assert(append->segment != NULL);
+		if (append->segment != segment) {
+			break; /* Not targeted to this segment */
+		}
+		QUEUE_REMOVE(head);
+		QUEUE_PUSH(&q, head);
+		n_reqs++;
+		rv = uvAliveSegmentEncodeEntriesToWriteBuf(segment, append);
+		if (rv != 0) {
+			goto err;
+		}
+	}
+
+	/* If we have no more requests for this segment, let's check if it has
+	 * been marked for closing, and in that case finalize it and possibly
+	 * trigger a write against the next segment (unless there is a truncate
+	 * request, in that case we need to wait for it). Otherwise it must mean
+	 * we have exhausted the queue of pending append requests. */
+	if (n_reqs == 0) {
+		assert(QUEUE_IS_EMPTY(&uv->append_writing_reqs));
+		if (segment->finalize) {
+			uvAliveSegmentFinalize(segment);
+			if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
+				goto start;
+			}
+		}
+		assert(QUEUE_IS_EMPTY(&uv->append_pending_reqs));
+		return 0;
+	}
+
+	while (!QUEUE_IS_EMPTY(&q)) {
+		head = QUEUE_HEAD(&q);
+		QUEUE_REMOVE(head);
+		QUEUE_PUSH(&uv->append_writing_reqs, head);
+	}
+
+	rv = uvAliveSegmentWrite(segment);
+	if (rv != 0) {
+		goto err;
+	}
+
+	return 0;
+
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+/* Invoked when a newly added open segment becomes ready for writing, after the
+ * associated UvPrepare request completes (either synchronously or
+ * asynchronously).
*/
+static int uvAliveSegmentReady(struct uv *uv,
+			       uv_file fd,
+			       uvCounter counter,
+			       struct uvAliveSegment *segment)
+{
+	int rv;
+	rv = UvWriterInit(&segment->writer, uv->loop, fd, uv->direct_io,
+			  uv->async_io, 1, uv->io->errmsg);
+	if (rv != 0) {
+		ErrMsgWrapf(uv->io->errmsg, "setup writer for open-%llu",
+			    counter);
+		return rv;
+	}
+	segment->counter = counter;
+	return 0;
+}
+
+static void uvAliveSegmentPrepareCb(struct uvPrepare *req, int status)
+{
+	struct uvAliveSegment *segment = req->data;
+	struct uv *uv = segment->uv;
+	int rv;
+
+	assert(segment->counter == 0);
+	assert(segment->written == 0);
+
+	/* If we have been closed, let's discard the segment. */
+	if (uv->closing) {
+		QUEUE_REMOVE(&segment->queue);
+		assert(status ==
+		       RAFT_CANCELED); /* UvPrepare cancels pending reqs */
+		uvSegmentBufferClose(&segment->pending);
+		RaftHeapFree(segment);
+		return;
+	}
+
+	if (status != 0) {
+		tracef("prepare segment failed (%d)", status);
+		rv = status;
+		goto err;
+	}
+
+	assert(req->counter > 0);
+	assert(req->fd >= 0);
+
+	/* There must be pending appends that were waiting for this prepare
+	 * request. */
+	assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs));
+
+	rv = uvAliveSegmentReady(uv, req->fd, req->counter, segment);
+	if (rv != 0) {
+		tracef("prepare segment ready failed (%d)", rv);
+		goto err;
+	}
+
+	rv = uvAppendMaybeStart(uv);
+	if (rv != 0) {
+		tracef("prepare segment start failed (%d)", rv);
+		goto err;
+	}
+
+	return;
+
+err:
+	QUEUE_REMOVE(&segment->queue);
+	RaftHeapFree(segment);
+	uv->errored = true;
+	uvAppendFinishPendingRequests(uv, rv);
+}
+
+/* Initialize a new open segment object. */
+static void uvAliveSegmentInit(struct uvAliveSegment *s, struct uv *uv)
+{
+	s->uv = uv;
+	s->prepare.data = s;
+	s->writer.data = s;
+	s->write.data = s;
+	s->counter = 0;
+	s->first_index = uv->append_next_index;
+	s->pending_last_index = s->first_index - 1;
+	s->last_index = 0;
+	s->size = sizeof(uint64_t) /* Format version */;
+	s->next_block = 0;
+	uvSegmentBufferInit(&s->pending, uv->block_size);
+	s->written = 0;
+	s->barrier = NULL;
+	s->finalize = false;
+}
+
+/* Add a new active open segment, since the append request being submitted does
+ * not fit in the last segment we scheduled writes for, or no segment had been
+ * previously requested at all. */
+static int uvAppendPushAliveSegment(struct uv *uv)
+{
+	struct uvAliveSegment *segment;
+	uv_file fd;
+	uvCounter counter;
+	int rv;
+
+	segment = RaftHeapMalloc(sizeof *segment);
+	if (segment == NULL) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+	uvAliveSegmentInit(segment, uv);
+
+	QUEUE_PUSH(&uv->append_segments, &segment->queue);
+
+	rv = UvPrepare(uv, &fd, &counter, &segment->prepare,
+		       uvAliveSegmentPrepareCb);
+	if (rv != 0) {
+		goto err_after_alloc;
+	}
+
+	/* If we've been returned a ready prepared segment right away, start
+	 * writing to it immediately. */
+	if (fd != -1) {
+		rv = uvAliveSegmentReady(uv, fd, counter, segment);
+		if (rv != 0) {
+			goto err_after_prepare;
+		}
+	}
+	return 0;
+
+err_after_prepare:
+	UvOsClose(fd);
+	UvFinalize(uv, counter, 0, 0, 0);
+err_after_alloc:
+	QUEUE_REMOVE(&segment->queue);
+	RaftHeapFree(segment);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+/* Return the last segment that we have requested to prepare.
*/ +static struct uvAliveSegment *uvGetLastAliveSegment(struct uv *uv) +{ + queue *tail; + if (QUEUE_IS_EMPTY(&uv->append_segments)) { + return NULL; + } + tail = QUEUE_TAIL(&uv->append_segments); + return QUEUE_DATA(tail, struct uvAliveSegment, queue); +} + +/* Return #true if the remaining capacity of the given segment is equal or + * greater than @size. */ +static bool uvAliveSegmentHasEnoughSpareCapacity(struct uvAliveSegment *s, + size_t size) +{ + return s->size + size <= s->uv->segment_size; +} + +/* Add @size bytes to the number of bytes that the segment will hold. The actual + * write will happen when the previous write completes, if any. */ +static void uvAliveSegmentReserveSegmentCapacity(struct uvAliveSegment *s, + size_t size) +{ + s->size += size; +} + +/* Return the number of bytes needed to store the batch of entries of this + * append request on disk. */ +static size_t uvAppendSize(struct uvAppend *a) +{ + size_t size = sizeof(uint32_t) * 2; /* CRC checksums */ + unsigned i; + size += uvSizeofBatchHeader(a->n); /* Batch header */ + for (i = 0; i < a->n; i++) { /* Entries data */ + size += bytePad64(a->entries[i].buf.len); + } + return size; +} + +/* Enqueue an append entries request, assigning it to the appropriate active + * open segment. */ +static int uvAppendEnqueueRequest(struct uv *uv, struct uvAppend *append) +{ + struct uvAliveSegment *segment; + size_t size; + bool fits; + int rv; + + assert(append->entries != NULL); + assert(append->n > 0); + assert(uv->append_next_index > 0); + tracef("enqueue %u entries", append->n); + + size = uvAppendSize(append); + + /* If we have no segments yet, it means this is the very first append, + * and we need to add a new segment. Otherwise we check if the last + * segment has enough room for this batch of entries. */ + segment = uvGetLastAliveSegment(uv); + if (segment == NULL || segment->finalize) { + fits = false; + } else { + fits = uvAliveSegmentHasEnoughSpareCapacity(segment, size); + if (!fits) { + segment->finalize = + true; /* Finalize when all writes are done */ + } + } + + /* If there's no segment or if this batch does not fit in this segment, + * we need to add a new one. 
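+	 *
+	 * E.g. (hypothetical numbers): with the default 8 MiB segment size,
+	 * a segment already accounting for 8 MiB minus 100 bytes cannot take
+	 * a 200-byte batch; it gets marked for finalization and the batch is
+	 * assigned to a freshly prepared segment.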
*/ + if (!fits) { + rv = uvAppendPushAliveSegment(uv); + if (rv != 0) { + goto err; + } + } + + segment = uvGetLastAliveSegment(uv); /* Get the last added segment */ + uvAliveSegmentReserveSegmentCapacity(segment, size); + + append->segment = segment; + QUEUE_PUSH(&uv->append_pending_reqs, &append->queue); + uv->append_next_index += append->n; + tracef("set uv->append_next_index %llu", uv->append_next_index); + + return 0; + +err: + assert(rv != 0); + return rv; +} + +/* Check that all entry buffers are 8-byte aligned */ +static int uvCheckEntryBuffersAligned(struct uv *uv, + const struct raft_entry entries[], + unsigned n) +{ + unsigned i; + + for (i = 0; i < n; i++) { + if (entries[i].buf.len % 8) { + ErrMsgPrintf(uv->io->errmsg, + "entry buffers must be 8-byte aligned"); + tracef("%s", uv->io->errmsg); + return RAFT_INVALID; + } + } + + return 0; +} + +int UvAppend(struct raft_io *io, + struct raft_io_append *req, + const struct raft_entry entries[], + unsigned n, + raft_io_append_cb cb) +{ + struct uv *uv; + struct uvAppend *append; + int rv; + + uv = io->impl; + assert(!uv->closing); + + append = RaftHeapCalloc(1, sizeof *append); + if (append == NULL) { + rv = RAFT_NOMEM; + goto err; + } + append->req = req; + append->entries = entries; + append->n = n; + req->cb = cb; + + rv = uvCheckEntryBuffersAligned(uv, entries, n); + if (rv != 0) { + goto err_after_req_alloc; + } + + rv = uvAppendEnqueueRequest(uv, append); + if (rv != 0) { + goto err_after_req_alloc; + } + + assert(append->segment != NULL); + assert(!QUEUE_IS_EMPTY(&uv->append_pending_reqs)); + + /* Try to write immediately. */ + rv = uvAppendMaybeStart(uv); + if (rv != 0) { + return rv; + } + + return 0; + +err_after_req_alloc: + RaftHeapFree(append); +err: + assert(rv != 0); + return rv; +} + +/* Finalize the current segment as soon as all its pending or inflight append + * requests get completed. */ +static void uvFinalizeCurrentAliveSegmentOnceIdle(struct uv *uv) +{ + struct uvAliveSegment *s; + queue *head; + bool has_pending_reqs; + bool has_writing_reqs; + + s = uvGetCurrentAliveSegment(uv); + if (s == NULL) { + return; + } + + /* Check if there are pending append requests targeted to the current + * segment. */ + has_pending_reqs = false; + QUEUE_FOREACH(head, &uv->append_pending_reqs) + { + struct uvAppend *r = QUEUE_DATA(head, struct uvAppend, queue); + if (r->segment == s) { + has_pending_reqs = true; + break; + } + } + has_writing_reqs = !QUEUE_IS_EMPTY(&uv->append_writing_reqs); + + /* If there is no pending append request or inflight write against the + * current segment, we can submit a request for it to be closed + * immediately. Otherwise, we set the finalize flag. + * + * TODO: is it actually possible to have pending requests with no + * writing requests? Probably no. 
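UvAppend above rejects any entry whose buffer length is not a multiple of 8 (see uvCheckEntryBuffersAligned). A sketch of how a caller might satisfy that contract, with a hypothetical helper that zero-pads the payload tail:

```c
#include <stdlib.h>
#include <string.h>

/* Hypothetical helper: size an entry buffer so that UvAppend's 8-byte
 * length rule holds, zero-padding the tail. */
static int entry_buf_alloc(struct raft_buffer *buf, const void *data,
			   size_t len)
{
	size_t padded = (len + 7) & ~(size_t)7;
	buf->base = malloc(padded);
	if (buf->base == NULL) {
		return RAFT_NOMEM;
	}
	memcpy(buf->base, data, len);
	memset((char *)buf->base + len, 0, padded - len);
	buf->len = padded;
	return 0;
}
```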
*/ + if (!has_pending_reqs && !has_writing_reqs) { + uvAliveSegmentFinalize(s); + } else { + s->finalize = true; + } +} + +bool UvBarrierReady(struct uv *uv) +{ + if (uv->barrier == NULL) { + return true; + } + + queue *head; + QUEUE_FOREACH(head, &uv->append_segments) + { + struct uvAliveSegment *segment; + segment = QUEUE_DATA(head, struct uvAliveSegment, queue); + if (segment->barrier == uv->barrier) { + return false; + } + } + return true; +} + +bool UvBarrierMaybeTrigger(struct UvBarrier *barrier) +{ + if (!barrier) { + return false; + } + + if (!QUEUE_IS_EMPTY(&barrier->reqs)) { + queue *head; + struct UvBarrierReq *r; + head = QUEUE_HEAD(&barrier->reqs); + QUEUE_REMOVE(head); + r = QUEUE_DATA(head, struct UvBarrierReq, queue); + r->cb(r); + return true; + } + + return false; +} + +/* Used during cleanup. */ +static void uvBarrierTriggerAll(struct UvBarrier *barrier) +{ + while (UvBarrierMaybeTrigger(barrier)) { + ; + } +} + +static struct UvBarrier *uvBarrierCreate(void) +{ + struct UvBarrier *barrier; + barrier = RaftHeapCalloc(1, sizeof(*barrier)); + if (!barrier) { + return NULL; + } + barrier->blocking = false; + QUEUE_INIT(&barrier->reqs); + return barrier; +} + +int UvBarrier(struct uv *uv, raft_index next_index, struct UvBarrierReq *req) +{ + /* The barrier to attach to. */ + struct UvBarrier *barrier = NULL; + struct uvAliveSegment *segment = NULL; + queue *head; + + assert(!uv->closing); + + /* The next entry will be appended at this index. */ + uv->append_next_index = next_index; + tracef("UvBarrier uv->append_next_index:%llu", uv->append_next_index); + + /* Arrange for all open segments not already involved in other barriers + * to be finalized as soon as their append requests get completed and + * mark them as involved in this specific barrier request. */ + QUEUE_FOREACH(head, &uv->append_segments) + { + segment = QUEUE_DATA(head, struct uvAliveSegment, queue); + if (segment->barrier != NULL) { + /* If a non-blocking barrier precedes this blocking + * request, we want to also block all future writes. */ + if (req->blocking) { + segment->barrier->blocking = true; + } + continue; + } + + if (!barrier) { + barrier = uvBarrierCreate(); + if (!barrier) { + return RAFT_NOMEM; + } + /* And add the request to the barrier. */ + UvBarrierAddReq(barrier, req); + } + segment->barrier = barrier; + + if (segment == uvGetCurrentAliveSegment(uv)) { + uvFinalizeCurrentAliveSegmentOnceIdle(uv); + continue; + } + segment->finalize = true; + } + + /* Unable to attach to a segment, because all segments are involved in a + * barrier, or there are no segments. */ + if (barrier == NULL) { + /* Attach req to last segment barrier. */ + if (segment != NULL) { + barrier = segment->barrier; + /* There is no segment, attach to uv->barrier. */ + } else if (uv->barrier != NULL) { + barrier = uv->barrier; + /* There is no uv->barrier, make new one. */ + } else { + barrier = uvBarrierCreate(); + if (!barrier) { + return RAFT_NOMEM; + } + } + UvBarrierAddReq(barrier, req); + } + + /* Let's not continue writing new entries if something down the line + * asked us to stop writing. */ + if (uv->barrier != NULL && req->blocking) { + uv->barrier->blocking = true; + } + + assert(barrier != NULL); + if (uv->barrier == NULL) { + uv->barrier = barrier; + /* If there's no pending append-related activity, we can fire + * the callback immediately. + * + * TODO: find a way to avoid invoking this synchronously. 
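From a caller's perspective, the contract of UvBarrier above is: submit a UvBarrierReq with a callback, do the work that requires appends to be quiescent, then call UvUnblock. A hypothetical caller follows; the `data` field used to stash the uv instance is an assumption for illustration, not necessarily the real request layout:

```c
/* Hypothetical caller of the barrier API. */
static void barrier_cb(struct UvBarrierReq *req)
{
	struct uv *uv = req->data; /* assumed stash, see note above */
	/* ... perform the work that needed appends to be quiescent ... */
	UvUnblock(uv); /* resume queued barrier requests and appends */
}

static int submit_blocking_barrier(struct uv *uv, raft_index next_index,
				   struct UvBarrierReq *req)
{
	req->blocking = true; /* also blocks all future writes, see above */
	req->cb = barrier_cb;
	req->data = uv;
	return UvBarrier(uv, next_index, req);
}
```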
*/
+		if (QUEUE_IS_EMPTY(&uv->append_segments) &&
+		    QUEUE_IS_EMPTY(&uv->finalize_reqs) &&
+		    uv->finalize_work.data == NULL) {
+			/* Not interested in return value. */
+			UvBarrierMaybeTrigger(barrier);
+		}
+	}
+
+	return 0;
+}
+
+void UvUnblock(struct uv *uv)
+{
+	/* Fire the oldest pending barrier request, if any. UvUnblock will be
+	 * called again when that request's callback has fired. */
+	if (UvBarrierMaybeTrigger(uv->barrier)) {
+		tracef("UvUnblock triggered barrier request callback.");
+		return;
+	}
+
+	/* All requests in the barrier have finished. */
+	tracef("UvUnblock queue empty");
+	RaftHeapFree(uv->barrier);
+	uv->barrier = NULL;
+	if (uv->closing) {
+		uvMaybeFireCloseCb(uv);
+		return;
+	}
+	if (!QUEUE_IS_EMPTY(&uv->append_pending_reqs)) {
+		int rv;
+		rv = uvAppendMaybeStart(uv);
+		if (rv != 0) {
+			uv->errored = true;
+		}
+	}
+}
+
+void UvBarrierAddReq(struct UvBarrier *barrier, struct UvBarrierReq *req)
+{
+	assert(barrier != NULL);
+	assert(req != NULL);
+	/* Once there's a blocking req, this barrier becomes blocking. */
+	barrier->blocking |= req->blocking;
+	QUEUE_PUSH(&barrier->reqs, &req->queue);
+}
+
+/* Fire all pending barrier requests; each barrier callback will notice that
+ * we're closing and abort there. */
+static void uvBarrierClose(struct uv *uv)
+{
+	tracef("uv barrier close");
+	struct UvBarrier *barrier = NULL;
+	queue *head;
+	assert(uv->closing);
+	QUEUE_FOREACH(head, &uv->append_segments)
+	{
+		struct uvAliveSegment *segment;
+		segment = QUEUE_DATA(head, struct uvAliveSegment, queue);
+		if (segment->barrier != NULL && segment->barrier != barrier &&
+		    segment->barrier != uv->barrier) {
+			barrier = segment->barrier;
+			/* Fire all barrier cb's; this is safe because the
+			 * barrier cb exits early when uv->closing is true. */
+			uvBarrierTriggerAll(barrier);
+			RaftHeapFree(barrier);
+		}
+		/* The segment->barrier field is used:
+		 *
+		 * - by UvBarrierReady, to check whether it's time to invoke
+		 *   the barrier callback after successfully finalizing a
+		 *   segment
+		 * - by uvAppendMaybeStart, to see whether we should go ahead
+		 *   with writing to a segment even though a barrier is active,
+		 *   because the barrier is waiting on that same segment to be
+		 *   finalized (but see the FIXME in that function)
+		 * - to save a barrier for later, if UvBarrier was called when
+		 *   uv->barrier was already set
+		 *
+		 * If we're cancelling the barrier, we don't need to save it
+		 * for later; the callback will not be invoked a second time in
+		 * any case; and uvAppendMaybeStart won't be called while
+		 * closing. So it's fine to clear segment->barrier here. */
+		segment->barrier = NULL;
+	}
+
+	/* There might still be a current barrier set on uv->barrier, meaning
+	 * that the open segment it was associated with has started to be
+	 * finalized and is no longer in the append_segments queue. Let's
+	 * cancel all untriggered barrier request callbacks too. */
+	if (uv->barrier != NULL) {
+		uvBarrierTriggerAll(uv->barrier);
+		/* Clear uv->barrier if there's no active work on the thread
+		 * pool. When the work on the thread pool finishes, UvUnblock
+		 * will notice we're closing, clear and free uv->barrier and
+		 * call uvMaybeFireCloseCb. UvUnblock will not try to fire any
+		 * more barrier request callbacks, because they were triggered
+		 * in the line above.
*/ + if (uv->snapshot_put_work.data == NULL && + uv->truncate_work.data == NULL) { + RaftHeapFree(uv->barrier); + uv->barrier = NULL; + } + } +} + +void uvAppendClose(struct uv *uv) +{ + struct uvAliveSegment *segment; + assert(uv->closing); + + uvBarrierClose(uv); + UvPrepareClose(uv); + + uvAppendFinishPendingRequests(uv, RAFT_CANCELED); + + uvFinalizeCurrentAliveSegmentOnceIdle(uv); + + /* Also finalize the segments that we didn't write at all and are just + * sitting in the append_segments queue waiting for writes against the + * current segment to complete. */ + while (!QUEUE_IS_EMPTY(&uv->append_segments)) { + segment = uvGetLastAliveSegment(uv); + assert(segment != NULL); + if (segment == uvGetCurrentAliveSegment(uv)) { + break; /* We reached the head of the queue */ + } + assert(segment->written == 0); + uvAliveSegmentFinalize(segment); + } +} diff --git a/src/raft/uv_encoding.c b/src/raft/uv_encoding.c new file mode 100644 index 000000000..085192e83 --- /dev/null +++ b/src/raft/uv_encoding.c @@ -0,0 +1,581 @@ +#include "uv_encoding.h" + +#include +#include + +#include "../raft.h" +#include "assert.h" +#include "byte.h" +#include "configuration.h" + +/** + * Size of the request preamble. + */ +#define RAFT_IO_UV__PREAMBLE_SIZE \ + (sizeof(uint64_t) /* Message type. */ + \ + sizeof(uint64_t) /* Message size. */) + +static size_t sizeofRequestVoteV1(void) +{ + return sizeof(uint64_t) + /* Term. */ + sizeof(uint64_t) + /* Candidate ID. */ + sizeof(uint64_t) + /* Last log index. */ + sizeof(uint64_t) /* Last log term. */; +} + +static size_t sizeofRequestVote(void) +{ + return sizeofRequestVoteV1() + + sizeof(uint64_t) /* Leadership transfer. */; +} + +static size_t sizeofRequestVoteResultV1(void) +{ + return sizeof(uint64_t) + /* Term. */ + sizeof(uint64_t) /* Vote granted. */; +} + +static size_t sizeofRequestVoteResult(void) +{ + return sizeofRequestVoteResultV1() + /* Size of older version 1 message + */ + sizeof(uint64_t) /* Flags. */; +} + +static size_t sizeofAppendEntries(const struct raft_append_entries *p) +{ + return sizeof(uint64_t) + /* Leader's term. */ + sizeof(uint64_t) + /* Leader ID */ + sizeof(uint64_t) + /* Previous log entry index */ + sizeof(uint64_t) + /* Previous log entry term */ + sizeof(uint64_t) + /* Leader's commit index */ + sizeof(uint64_t) + /* Number of entries in the batch */ + 16 * p->n_entries /* One header per entry */; +} + +static size_t sizeofAppendEntriesResultV0(void) +{ + return sizeof(uint64_t) + /* Term. */ + sizeof(uint64_t) + /* Success. */ + sizeof(uint64_t) /* Last log index. */; +} + +static size_t sizeofAppendEntriesResult(void) +{ + return sizeofAppendEntriesResultV0() + + sizeof(uint64_t) /* 64 bit Flags. */; +} + +static size_t sizeofInstallSnapshot(const struct raft_install_snapshot *p) +{ + size_t conf_size = configurationEncodedSize(&p->conf); + return sizeof(uint64_t) + /* Leader's term. */ + sizeof(uint64_t) + /* Leader ID */ + sizeof(uint64_t) + /* Snapshot's last index */ + sizeof(uint64_t) + /* Term of last index */ + sizeof(uint64_t) + /* Configuration's index */ + sizeof(uint64_t) + /* Length of configuration */ + conf_size + /* Configuration data */ + sizeof(uint64_t); /* Length of snapshot data */ +} + +static size_t sizeofTimeoutNow(void) +{ + return sizeof(uint64_t) + /* Term. */ + sizeof(uint64_t) + /* Last log index. */ + sizeof(uint64_t) /* Last log term. 
*/; +} + +size_t uvSizeofBatchHeader(size_t n) +{ + return 8 + /* Number of entries in the batch, little endian */ + 16 * n /* One header per entry */; +} + +static void encodeRequestVote(const struct raft_request_vote *p, void *buf) +{ + void *cursor = buf; + uint64_t flags = 0; + + if (p->disrupt_leader) { + flags |= 1 << 0; + } + if (p->pre_vote) { + flags |= 1 << 1; + } + + bytePut64(&cursor, p->term); + bytePut64(&cursor, p->candidate_id); + bytePut64(&cursor, p->last_log_index); + bytePut64(&cursor, p->last_log_term); + bytePut64(&cursor, flags); +} + +static void encodeRequestVoteResult(const struct raft_request_vote_result *p, + void *buf) +{ + void *cursor = buf; + uint64_t flags = 0; + + if (p->pre_vote) { + flags |= (1 << 0); + } + + bytePut64(&cursor, p->term); + bytePut64(&cursor, p->vote_granted); + bytePut64(&cursor, flags); +} + +static void encodeAppendEntries(const struct raft_append_entries *p, void *buf) +{ + void *cursor; + + cursor = buf; + + bytePut64(&cursor, p->term); /* Leader's term. */ + bytePut64(&cursor, p->prev_log_index); /* Previous index. */ + bytePut64(&cursor, p->prev_log_term); /* Previous term. */ + bytePut64(&cursor, p->leader_commit); /* Commit index. */ + + uvEncodeBatchHeader(p->entries, p->n_entries, cursor); +} + +static void encodeAppendEntriesResult( + const struct raft_append_entries_result *p, + void *buf) +{ + void *cursor = buf; + + bytePut64(&cursor, p->term); + bytePut64(&cursor, p->rejected); + bytePut64(&cursor, p->last_log_index); + bytePut64(&cursor, p->features); +} + +static void encodeInstallSnapshot(const struct raft_install_snapshot *p, + void *buf) +{ + void *cursor; + size_t conf_size = configurationEncodedSize(&p->conf); + + cursor = buf; + + bytePut64(&cursor, p->term); /* Leader's term. */ + bytePut64(&cursor, p->last_index); /* Snapshot last index. */ + bytePut64(&cursor, p->last_term); /* Term of last index. */ + bytePut64(&cursor, p->conf_index); /* Configuration index. */ + bytePut64(&cursor, conf_size); /* Configuration length. */ + configurationEncodeToBuf(&p->conf, cursor); + cursor = (uint8_t *)cursor + conf_size; + bytePut64(&cursor, p->data.len); /* Snapshot data size. */ +} + +static void encodeTimeoutNow(const struct raft_timeout_now *p, void *buf) +{ + void *cursor = buf; + + bytePut64(&cursor, p->term); + bytePut64(&cursor, p->last_log_index); + bytePut64(&cursor, p->last_log_term); +} + +int uvEncodeMessage(const struct raft_message *message, + uv_buf_t **bufs, + unsigned *n_bufs) +{ + uv_buf_t header; + void *cursor; + + /* Figure out the length of the header for this request and allocate a + * buffer for it. */ + header.len = RAFT_IO_UV__PREAMBLE_SIZE; + switch (message->type) { + case RAFT_IO_REQUEST_VOTE: + header.len += sizeofRequestVote(); + break; + case RAFT_IO_REQUEST_VOTE_RESULT: + header.len += sizeofRequestVoteResult(); + break; + case RAFT_IO_APPEND_ENTRIES: + header.len += + sizeofAppendEntries(&message->append_entries); + break; + case RAFT_IO_APPEND_ENTRIES_RESULT: + header.len += sizeofAppendEntriesResult(); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + header.len += + sizeofInstallSnapshot(&message->install_snapshot); + break; + case RAFT_IO_TIMEOUT_NOW: + header.len += sizeofTimeoutNow(); + break; + default: + return RAFT_MALFORMED; + }; + + header.base = raft_malloc(header.len); + if (header.base == NULL) { + goto oom; + } + + cursor = header.base; + + /* Encode the request preamble, with message type and message size. 
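For concreteness, the resulting framing for a version-2 RequestVote message, worked out from the sizeof helpers above:

```c
/*
 * preamble: 2 * 8 = 16 bytes  (message type, then the number of bytes
 *                              that follow)
 * header:   5 * 8 = 40 bytes  (term, candidate id, last log index,
 *                              last log term, flags)
 * total:            56 bytes, every field encoded little-endian
 */
```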
*/ + bytePut64(&cursor, message->type); + bytePut64(&cursor, header.len - RAFT_IO_UV__PREAMBLE_SIZE); + + /* Encode the request header. */ + switch (message->type) { + case RAFT_IO_REQUEST_VOTE: + encodeRequestVote(&message->request_vote, cursor); + break; + case RAFT_IO_REQUEST_VOTE_RESULT: + encodeRequestVoteResult(&message->request_vote_result, + cursor); + break; + case RAFT_IO_APPEND_ENTRIES: + encodeAppendEntries(&message->append_entries, cursor); + break; + case RAFT_IO_APPEND_ENTRIES_RESULT: + encodeAppendEntriesResult( + &message->append_entries_result, cursor); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + encodeInstallSnapshot(&message->install_snapshot, + cursor); + break; + case RAFT_IO_TIMEOUT_NOW: + encodeTimeoutNow(&message->timeout_now, cursor); + break; + }; + + *n_bufs = 1; + + /* For AppendEntries request we also send the entries payload. */ + if (message->type == RAFT_IO_APPEND_ENTRIES) { + *n_bufs += message->append_entries.n_entries; + } + + /* For InstallSnapshot request we also send the snapshot payload. */ + if (message->type == RAFT_IO_INSTALL_SNAPSHOT) { + *n_bufs += 1; + } + + *bufs = raft_calloc(*n_bufs, sizeof **bufs); + if (*bufs == NULL) { + goto oom_after_header_alloc; + } + + (*bufs)[0] = header; + + if (message->type == RAFT_IO_APPEND_ENTRIES) { + unsigned i; + for (i = 0; i < message->append_entries.n_entries; i++) { + const struct raft_entry *entry = + &message->append_entries.entries[i]; + (*bufs)[i + 1].base = entry->buf.base; + (*bufs)[i + 1].len = entry->buf.len; + } + } + + if (message->type == RAFT_IO_INSTALL_SNAPSHOT) { + (*bufs)[1].base = message->install_snapshot.data.base; + (*bufs)[1].len = message->install_snapshot.data.len; + } + + return 0; + +oom_after_header_alloc: + raft_free(header.base); + +oom: + return RAFT_NOMEM; +} + +void uvEncodeBatchHeader(const struct raft_entry *entries, + unsigned n, + void *buf) +{ + unsigned i; + void *cursor = buf; + + /* Number of entries in the batch, little endian */ + bytePut64(&cursor, n); + + for (i = 0; i < n; i++) { + const struct raft_entry *entry = &entries[i]; + + /* Term in which the entry was created, little endian. */ + bytePut64(&cursor, entry->term); + + /* Message type (Either RAFT_COMMAND or RAFT_CHANGE) */ + bytePut8(&cursor, (uint8_t)entry->type); + + cursor = (uint8_t *)cursor + 3; /* Unused */ + + /* Size of the log entry data, little endian. */ + bytePut32(&cursor, (uint32_t)entry->buf.len); + } +} + +static void decodeRequestVote(const uv_buf_t *buf, struct raft_request_vote *p) +{ + const void *cursor; + + cursor = buf->base; + + p->version = 1; + p->term = byteGet64(&cursor); + p->candidate_id = byteGet64(&cursor); + p->last_log_index = byteGet64(&cursor); + p->last_log_term = byteGet64(&cursor); + + /* Support for legacy request vote that doesn't have disrupt_leader. 
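The decoders below rely on header length alone to tell protocol versions apart; no explicit version field is sent on the wire. For RequestVote:

```c
/*
 * header length 32 (4 fields) -> v1: no flags word;
 *                                disrupt_leader = pre_vote = false
 * header length 40 (5 fields) -> v2: trailing flags word;
 *                                bit 0 = disrupt_leader, bit 1 = pre_vote
 */
```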
*/ + if (buf->len == sizeofRequestVoteV1()) { + p->disrupt_leader = false; + p->pre_vote = false; + } else { + p->version = 2; + uint64_t flags = byteGet64(&cursor); + p->disrupt_leader = (bool)(flags & 1 << 0); + p->pre_vote = (bool)(flags & 1 << 1); + } +} + +static void decodeRequestVoteResult(const uv_buf_t *buf, + struct raft_request_vote_result *p) +{ + const void *cursor; + + cursor = buf->base; + + p->version = 1; + p->term = byteGet64(&cursor); + p->vote_granted = byteGet64(&cursor); + + if (buf->len > sizeofRequestVoteResultV1()) { + p->version = 2; + uint64_t flags = byteGet64(&cursor); + p->pre_vote = (flags & (1 << 0)); + } +} + +int uvDecodeBatchHeader(const void *batch, + struct raft_entry **entries, + unsigned *n) +{ + const void *cursor = batch; + size_t i; + int rv; + + *n = (unsigned)byteGet64(&cursor); + + if (*n == 0) { + *entries = NULL; + return 0; + } + + *entries = raft_malloc(*n * sizeof **entries); + + if (*entries == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + for (i = 0; i < *n; i++) { + struct raft_entry *entry = &(*entries)[i]; + + entry->term = byteGet64(&cursor); + entry->type = byteGet8(&cursor); + + if (entry->type != RAFT_COMMAND && + entry->type != RAFT_BARRIER && entry->type != RAFT_CHANGE) { + rv = RAFT_MALFORMED; + goto err_after_alloc; + } + + cursor = (uint8_t *)cursor + 3; /* Unused */ + + /* Size of the log entry data, little endian. */ + entry->buf.len = byteGet32(&cursor); + } + + return 0; + +err_after_alloc: + raft_free(*entries); + *entries = NULL; + +err: + assert(rv != 0); + + return rv; +} + +static int decodeAppendEntries(const uv_buf_t *buf, + struct raft_append_entries *args) +{ + const void *cursor; + int rv; + + assert(buf != NULL); + assert(args != NULL); + + cursor = buf->base; + + args->version = 0; + args->term = byteGet64(&cursor); + args->prev_log_index = byteGet64(&cursor); + args->prev_log_term = byteGet64(&cursor); + args->leader_commit = byteGet64(&cursor); + + rv = uvDecodeBatchHeader(cursor, &args->entries, &args->n_entries); + if (rv != 0) { + return rv; + } + + return 0; +} + +static void decodeAppendEntriesResult(const uv_buf_t *buf, + struct raft_append_entries_result *p) +{ + const void *cursor; + + cursor = buf->base; + + p->version = 0; + p->term = byteGet64(&cursor); + p->rejected = byteGet64(&cursor); + p->last_log_index = byteGet64(&cursor); + p->features = 0; + if (buf->len > sizeofAppendEntriesResultV0()) { + p->version = 1; + p->features = byteGet64(&cursor); + } +} + +static int decodeInstallSnapshot(const uv_buf_t *buf, + struct raft_install_snapshot *args) +{ + const void *cursor; + struct raft_buffer conf; + int rv; + + assert(buf != NULL); + assert(args != NULL); + + cursor = buf->base; + + args->version = 0; + args->term = byteGet64(&cursor); + args->last_index = byteGet64(&cursor); + args->last_term = byteGet64(&cursor); + args->conf_index = byteGet64(&cursor); + conf.len = (size_t)byteGet64(&cursor); + conf.base = (void *)cursor; + + rv = configurationDecode(&conf, &args->conf); + if (rv != 0) { + return rv; + } + cursor = (uint8_t *)cursor + conf.len; + args->data.len = (size_t)byteGet64(&cursor); + + return 0; +} + +static void decodeTimeoutNow(const uv_buf_t *buf, struct raft_timeout_now *p) +{ + const void *cursor; + + cursor = buf->base; + + p->version = 0; + p->term = byteGet64(&cursor); + p->last_log_index = byteGet64(&cursor); + p->last_log_term = byteGet64(&cursor); +} + +int uvDecodeMessage(uint16_t type, + const uv_buf_t *header, + struct raft_message *message, + size_t *payload_len) +{ 
+	unsigned i;
+	int rv = 0;
+
+	memset(message, 0, sizeof(*message));
+	message->type = (unsigned short)type;
+
+	*payload_len = 0;
+
+	/* Decode the header. */
+	switch (type) {
+		case RAFT_IO_REQUEST_VOTE:
+			decodeRequestVote(header, &message->request_vote);
+			break;
+		case RAFT_IO_REQUEST_VOTE_RESULT:
+			decodeRequestVoteResult(header,
+						&message->request_vote_result);
+			break;
+		case RAFT_IO_APPEND_ENTRIES:
+			rv = decodeAppendEntries(header,
+						 &message->append_entries);
+			for (i = 0; i < message->append_entries.n_entries;
+			     i++) {
+				*payload_len +=
+				    message->append_entries.entries[i].buf.len;
+			}
+			break;
+		case RAFT_IO_APPEND_ENTRIES_RESULT:
+			decodeAppendEntriesResult(
+			    header, &message->append_entries_result);
+			break;
+		case RAFT_IO_INSTALL_SNAPSHOT:
+			rv = decodeInstallSnapshot(header,
+						   &message->install_snapshot);
+			*payload_len += message->install_snapshot.data.len;
+			break;
+		case RAFT_IO_TIMEOUT_NOW:
+			decodeTimeoutNow(header, &message->timeout_now);
+			break;
+		default:
+			rv = RAFT_IOERR;
+			break;
+	};
+
+	return rv;
+}
+
+void uvDecodeEntriesBatch(uint8_t *batch,
+			  size_t offset,
+			  struct raft_entry *entries,
+			  unsigned n)
+{
+	uint8_t *cursor;
+	size_t i;
+
+	assert(batch != NULL);
+
+	cursor = batch + offset;
+
+	for (i = 0; i < n; i++) {
+		struct raft_entry *entry = &entries[i];
+		entry->batch = batch;
+
+		if (entry->buf.len == 0) {
+			entry->buf.base = NULL;
+			continue;
+		}
+
+		entry->buf.base = cursor;
+
+		cursor = cursor + entry->buf.len;
+		if (entry->buf.len % 8 != 0) {
+			/* Add padding */
+			cursor = cursor + 8 - (entry->buf.len % 8);
+		}
+	}
+}
diff --git a/src/raft/uv_encoding.h b/src/raft/uv_encoding.h
new file mode 100644
index 000000000..e0c2626e1
--- /dev/null
+++ b/src/raft/uv_encoding.h
@@ -0,0 +1,59 @@
+/* Encoding routines for the libuv-based @raft_io backend. */
+
+#ifndef UV_ENCODING_H_
+#define UV_ENCODING_H_
+
+#include <uv.h>
+
+#include "../raft.h"
+
+/* Current disk format version. */
+#define UV__DISK_FORMAT 1
+
+int uvEncodeMessage(const struct raft_message *message,
+		    uv_buf_t **bufs,
+		    unsigned *n_bufs);
+
+int uvDecodeMessage(uint16_t type,
+		    const uv_buf_t *header,
+		    struct raft_message *message,
+		    size_t *payload_len);
+
+int uvDecodeBatchHeader(const void *batch,
+			struct raft_entry **entries,
+			unsigned *n);
+
+void uvDecodeEntriesBatch(uint8_t *batch,
+			  size_t offset,
+			  struct raft_entry *entries,
+			  unsigned n);
+
+/**
+ * The layout of the memory pointed at by a @batch pointer is the following:
+ *
+ * [8 bytes] Number of entries in the batch, little endian.
+ * [header1] Header data of the first entry of the batch.
+ * [ ...   ] More headers
+ * [headerN] Header data of the last entry of the batch.
+ * [data1  ] Payload data of the first entry of the batch.
+ * [ ...   ] More data
+ * [dataN  ] Payload data of the last entry of the batch.
+ *
+ * An entry header is 16 bytes long and has the following layout:
+ *
+ * [8 bytes] Term in which the entry was created, little endian.
+ * [1 byte ] Message type (either RAFT_COMMAND or RAFT_CHANGE)
+ * [3 bytes] Currently unused.
+ * [4 bytes] Size of the log entry data, little endian.
+ *
+ * A payload data section for an entry is simply a sequence of bytes of
+ * arbitrary length, possibly padded with extra bytes to reach an 8-byte
+ * boundary (which means that all entry data pointers are 8-byte aligned).
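A concrete instance of the layout just described, for a batch of two entries carrying 5 and 8 bytes of payload:

```c
/*
 * offset  0: 02 00 00 00 00 00 00 00  number of entries = 2
 * offset  8: term of entry 1 (8 bytes, little endian)
 * offset 16: type (1) + unused (3) + data length = 5 (4 bytes)
 * offset 24: term of entry 2
 * offset 32: type (1) + unused (3) + data length = 8 (4 bytes)
 * offset 40: 5 payload bytes of entry 1 + 3 padding bytes
 * offset 48: 8 payload bytes of entry 2 (already 8-byte aligned)
 * offset 56: end of batch
 */
```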
+ */
+size_t uvSizeofBatchHeader(size_t n);
+
+void uvEncodeBatchHeader(const struct raft_entry *entries,
+			 unsigned n,
+			 void *buf);
+
+#endif /* UV_ENCODING_H_ */
diff --git a/src/raft/uv_finalize.c b/src/raft/uv_finalize.c
new file mode 100644
index 000000000..638b551ad
--- /dev/null
+++ b/src/raft/uv_finalize.c
@@ -0,0 +1,176 @@
+#include "assert.h"
+#include "heap.h"
+#include "queue.h"
+#include "uv.h"
+#include "uv_os.h"
+
+/* Metadata about an open segment that is not used anymore and should be
+ * closed or removed (if it was not written at all). */
+struct uvDyingSegment
+{
+	struct uv *uv;
+	uvCounter counter;      /* Segment counter */
+	size_t used;            /* Number of used bytes */
+	raft_index first_index; /* Index of first entry */
+	raft_index last_index;  /* Index of last entry */
+	int status;             /* Status code of blocking syscalls */
+	queue queue;            /* Link to finalize queue */
+};
+
+/* Run all blocking syscalls involved in closing a used open segment.
+ *
+ * An open segment is closed by truncating its length to the number of bytes
+ * that were actually written into it and then renaming it. */
+static void uvFinalizeWorkCb(uv_work_t *work)
+{
+	struct uvDyingSegment *segment = work->data;
+	struct uv *uv = segment->uv;
+	char filename1[UV__FILENAME_LEN];
+	char filename2[UV__FILENAME_LEN];
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int rv;
+
+	sprintf(filename1, UV__OPEN_TEMPLATE, segment->counter);
+	sprintf(filename2, UV__CLOSED_TEMPLATE, segment->first_index,
+		segment->last_index);
+
+	tracef("finalize %s into %s", filename1, filename2);
+
+	/* If the segment hasn't actually been used (because the writer has
+	 * been closed or aborted before making any write), just remove it. */
+	if (segment->used == 0) {
+		tracef("remove unused segment file: %s", filename1);
+		rv = UvFsRemoveFile(uv->dir, filename1, errmsg);
+		if (rv != 0) {
+			goto err;
+		}
+		goto sync;
+	}
+
+	/* Truncate and rename the segment. */
+	rv = UvFsTruncateAndRenameFile(uv->dir, segment->used, filename1,
+				       filename2, errmsg);
+	if (rv != 0) {
+		goto err;
+	}
+
+sync:
+	rv = UvFsSyncDir(uv->dir, errmsg);
+	if (rv != 0) {
+		goto err;
+	}
+
+	segment->status = 0;
+	return;
+
+err:
+	tracef("truncate segment %s: %s", filename1, errmsg);
+	assert(rv != 0);
+	segment->status = rv;
+}
+
+static int uvFinalizeStart(struct uvDyingSegment *segment);
+static void uvFinalizeAfterWorkCb(uv_work_t *work, int status)
+{
+	struct uvDyingSegment *segment = work->data;
+	struct uv *uv = segment->uv;
+	tracef("uv finalize after work segment %p cb status:%d",
+	       (void *)segment, status);
+	queue *head;
+	int rv;
+
+	assert(status == 0); /* We don't cancel worker requests */
+	uv->finalize_work.data = NULL;
+	if (segment->status != 0) {
+		uv->errored = true;
+	}
+	RaftHeapFree(segment);
+
+	/* If we have no more dismissed segments to close, check if there's a
+	 * barrier to unblock or if we are done closing. */
+	if (QUEUE_IS_EMPTY(&uv->finalize_reqs)) {
+		tracef("unblock barrier or close");
+		if (uv->barrier != NULL && UvBarrierReady(uv)) {
+			UvBarrierMaybeTrigger(uv->barrier);
+		}
+		uvMaybeFireCloseCb(uv);
+		return;
+	}
+
+	/* Grab a new dismissed segment to close. */
+	head = QUEUE_HEAD(&uv->finalize_reqs);
+	segment = QUEUE_DATA(head, struct uvDyingSegment, queue);
+	QUEUE_REMOVE(&segment->queue);
+
+	rv = uvFinalizeStart(segment);
+	if (rv != 0) {
+		RaftHeapFree(segment);
+		uv->errored = true;
+	}
+}
+
+/* Start finalizing an open segment.
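uvFinalizeWorkCb above truncates a used segment to its written size and renames it from its open-segment name to a closed-segment name. An illustration of the transition, assuming the usual template values (the actual UV__OPEN_TEMPLATE and UV__CLOSED_TEMPLATE formats live in uv.h, which is not part of this diff):

```c
/* Assumed formats: "open-%llu" for writable segments, a pair of
 * zero-padded entry indices for closed ones.
 *
 *   open-42  ->  0000000000001205-0000000000001300
 *
 * A writable segment is identified by its creation counter; a closed one
 * by the range of entry indices it contains. Unused segments (used == 0)
 * are simply removed instead of renamed. */
```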
*/ +static int uvFinalizeStart(struct uvDyingSegment *segment) +{ + struct uv *uv = segment->uv; + int rv; + + assert(uv->finalize_work.data == NULL); + assert(segment->counter > 0); + + uv->finalize_work.data = segment; + + rv = uv_queue_work(uv->loop, &uv->finalize_work, uvFinalizeWorkCb, + uvFinalizeAfterWorkCb); + if (rv != 0) { + ErrMsgPrintf(uv->io->errmsg, + "start to truncate segment file %llu: %s", + segment->counter, uv_strerror(rv)); + return RAFT_IOERR; + } + + return 0; +} + +int UvFinalize(struct uv *uv, + unsigned long long counter, + size_t used, + raft_index first_index, + raft_index last_index) +{ + struct uvDyingSegment *segment; + int rv; + + if (used > 0) { + assert(first_index > 0); + assert(last_index >= first_index); + } + + segment = RaftHeapMalloc(sizeof *segment); + if (segment == NULL) { + return RAFT_NOMEM; + } + + segment->uv = uv; + segment->counter = counter; + segment->used = used; + segment->first_index = first_index; + segment->last_index = last_index; + + /* If we're already processing a segment, let's put the request in the + * queue and wait. */ + if (uv->finalize_work.data != NULL) { + QUEUE_PUSH(&uv->finalize_reqs, &segment->queue); + return 0; + } + + rv = uvFinalizeStart(segment); + if (rv != 0) { + RaftHeapFree(segment); + return rv; + } + + return 0; +} + +#undef tracef diff --git a/src/raft/uv_fs.c b/src/raft/uv_fs.c new file mode 100644 index 000000000..d28ac9eec --- /dev/null +++ b/src/raft/uv_fs.c @@ -0,0 +1,933 @@ +#include "uv_fs.h" + +#include +#include +#include +#include + +#include "assert.h" +#include "compress.h" +#include "err.h" +#include "heap.h" +#include "uv_os.h" + +int UvFsCheckDir(const char *dir, char *errmsg) +{ + struct uv_fs_s req; + int rv; + + /* Make sure we have a directory we can write into. */ + rv = uv_fs_stat(NULL, &req, dir, NULL); + if (rv != 0) { + switch (rv) { + case UV_ENOENT: + ErrMsgPrintf((char *)errmsg, + "directory '%s' does not exist", + dir); + return RAFT_NOTFOUND; + case UV_EACCES: + ErrMsgPrintf((char *)errmsg, + "can't access directory '%s'", + dir); + return RAFT_UNAUTHORIZED; + case UV_ENOTDIR: + ErrMsgPrintf((char *)errmsg, + "path '%s' is not a directory", + dir); + return RAFT_INVALID; + } + ErrMsgPrintf((char *)errmsg, "can't stat '%s': %s", dir, + uv_strerror(rv)); + return RAFT_IOERR; + } + + if (!(req.statbuf.st_mode & S_IFDIR)) { + ErrMsgPrintf((char *)errmsg, "path '%s' is not a directory", + dir); + return RAFT_INVALID; + } + + if (!(req.statbuf.st_mode & S_IWRITE)) { + ErrMsgPrintf((char *)errmsg, "directory '%s' is not writable", + dir); + return RAFT_INVALID; + } + + return 0; +} + +int UvFsSyncDir(const char *dir, char *errmsg) +{ + uv_file fd; + int rv; + rv = UvOsOpen(dir, UV_FS_O_RDONLY | UV_FS_O_DIRECTORY, 0, &fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "open directory", rv); + return RAFT_IOERR; + } + rv = UvOsFsync(fd); + UvOsClose(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync directory", rv); + return RAFT_IOERR; + } + return 0; +} + +int UvFsFileExists(const char *dir, + const char *filename, + bool *exists, + char *errmsg) +{ + uv_stat_t sb; + char path[UV__PATH_SZ]; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = UvOsStat(path, &sb); + if (rv != 0) { + if (rv == UV_ENOENT) { + *exists = false; + goto out; + } + UvOsErrMsg(errmsg, "stat", rv); + return RAFT_IOERR; + } + + *exists = true; + +out: + return 0; +} + +/* Get the size of the given file. 
*/ +int UvFsFileSize(const char *dir, + const char *filename, + off_t *size, + char *errmsg) +{ + uv_stat_t sb; + char path[UV__PATH_SZ]; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = UvOsStat(path, &sb); + if (rv != 0) { + UvOsErrMsg(errmsg, "stat", rv); + return RAFT_IOERR; + } + *size = (off_t)sb.st_size; + + return 0; +} + +int UvFsFileIsEmpty(const char *dir, + const char *filename, + bool *empty, + char *errmsg) +{ + off_t size; + int rv; + + rv = UvFsFileSize(dir, filename, &size, errmsg); + if (rv != 0) { + return rv; + } + *empty = size == 0 ? true : false; + return 0; +} + +/* Open a file in a directory. */ +static int uvFsOpenFile(const char *dir, + const char *filename, + int flags, + int mode, + uv_file *fd, + char *errmsg) +{ + char path[UV__PATH_SZ]; + int rv; + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsOpen(path, flags, mode, fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "open", rv); + return RAFT_IOERR; + } + return 0; +} + +int UvFsOpenFileForReading(const char *dir, + const char *filename, + uv_file *fd, + char *errmsg) +{ + char path[UV__PATH_SZ]; + int flags = O_RDONLY; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + return uvFsOpenFile(dir, filename, flags, 0, fd, errmsg); +} + +int UvFsAllocateFile(const char *dir, + const char *filename, + size_t size, + uv_file *fd, + bool fallocate, + char *errmsg) +{ + char path[UV__PATH_SZ]; + int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */ + int rv = 0; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + /* Allocate the desired size. */ + if (fallocate) { + /* TODO: use RWF_DSYNC instead, if available. */ + flags |= O_DSYNC; + rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, + errmsg); + if (rv != 0) { + goto err; + } + rv = UvOsFallocate(*fd, 0, (off_t)size); + if (rv == 0) { + return 0; + } else if (rv == UV_ENOSPC) { + ErrMsgPrintf(errmsg, + "not enough space to allocate %zu bytes", + size); + rv = RAFT_NOSPACE; + goto err_after_open; + } else { + UvOsErrMsg(errmsg, "posix_allocate", rv); + rv = RAFT_IOERR; + goto err_after_open; + } + } else { + /* Emulate fallocate, open without O_DSYNC, because we risk + * doing a lot of synced writes. */ + rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, + errmsg); + if (rv != 0) { + goto err; + } + rv = UvOsFallocateEmulation(*fd, 0, (off_t)size); + if (rv == UV_ENOSPC) { + ErrMsgPrintf(errmsg, + "not enough space to allocate %zu bytes", + size); + rv = RAFT_NOSPACE; + goto err_after_open; + } else if (rv != 0) { + ErrMsgPrintf(errmsg, "fallocate emulation %d", rv); + rv = RAFT_IOERR; + goto err_after_open; + } + rv = UvOsFsync(*fd); + if (rv != 0) { + ErrMsgPrintf(errmsg, "fsync %d", rv); + rv = RAFT_IOERR; + goto err_after_open; + } + /* Now close and reopen the file with O_DSYNC */ + rv = UvOsClose(*fd); + if (rv != 0) { + ErrMsgPrintf(errmsg, "close %d", rv); + goto err_unlink; + } + /* TODO: use RWF_DSYNC instead, if available. 
*/ + flags = O_WRONLY | O_DSYNC; + rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, fd, + errmsg); + if (rv != 0) { + goto err_unlink; + } + } + + return 0; + +err_after_open: + UvOsClose(*fd); +err_unlink: + UvOsUnlink(path); +err: + assert(rv != 0); + return rv; +} + +static int uvFsWriteFile(const char *dir, + const char *filename, + int flags, + struct raft_buffer *bufs, + unsigned n_bufs, + char *errmsg) +{ + uv_file fd; + int rv; + size_t size; + unsigned i; + size = 0; + for (i = 0; i < n_bufs; i++) { + size += bufs[i].len; + } + rv = uvFsOpenFile(dir, filename, flags, S_IRUSR | S_IWUSR, &fd, errmsg); + if (rv != 0) { + goto err; + } + rv = UvOsWrite(fd, (const uv_buf_t *)bufs, n_bufs, 0); + if (rv != (int)(size)) { + if (rv < 0) { + UvOsErrMsg(errmsg, "write", rv); + } else { + ErrMsgPrintf(errmsg, + "short write: %d only bytes written", rv); + } + goto err_after_file_open; + } + rv = UvOsFsync(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync", rv); + goto err_after_file_open; + } + rv = UvOsClose(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "close", rv); + goto err; + } + return 0; + +err_after_file_open: + UvOsClose(fd); +err: + return rv; +} + +int UvFsMakeFile(const char *dir, + const char *filename, + struct raft_buffer *bufs, + unsigned n_bufs, + char *errmsg) +{ + int rv; + char tmp_filename[UV__FILENAME_LEN + 1] = {0}; + char path[UV__PATH_SZ] = {0}; + char tmp_path[UV__PATH_SZ] = {0}; + + /* Create a temp file with the given content + * TODO as of libuv 1.34.0, use `uv_fs_mkstemp` */ + size_t sz = sizeof(tmp_filename); + rv = snprintf(tmp_filename, sz, TMP_FILE_FMT, filename); + if (rv < 0 || rv >= (int)sz) { + return rv; + } + int flags = UV_FS_O_WRONLY | UV_FS_O_CREAT | UV_FS_O_EXCL; + rv = uvFsWriteFile(dir, tmp_filename, flags, bufs, n_bufs, errmsg); + if (rv != 0) { + goto err_after_tmp_create; + } + + /* Check if the file exists */ + bool exists = false; + rv = UvFsFileExists(dir, filename, &exists, errmsg); + if (rv != 0) { + goto err_after_tmp_create; + } + if (exists) { + rv = -1; + goto err_after_tmp_create; + } + + /* Rename the temp file. Remark that there is a race between the + * existence check and the rename, there is no `renameat2` equivalent in + * libuv. 
However, in the current implementation this should pose no + * problems.*/ + rv = UvOsJoin(dir, tmp_filename, tmp_path); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsRename(tmp_path, path); + if (rv != 0) { + UvOsErrMsg(errmsg, "rename", rv); + goto err_after_tmp_create; + } + + rv = UvFsSyncDir(dir, errmsg); + if (rv != 0) { + char ignored[RAFT_ERRMSG_BUF_SIZE]; + UvFsRemoveFile(dir, filename, ignored); + return rv; + } + + return 0; + +err_after_tmp_create: + UvFsRemoveFile(dir, tmp_filename, errmsg); + return rv; +} + +int UvFsMakeOrOverwriteFile(const char *dir, + const char *filename, + const struct raft_buffer *buf, + char *errmsg) +{ + char path[UV__PATH_SZ]; + int flags = UV_FS_O_WRONLY; + int mode = 0; + bool exists = true; + uv_file fd; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + +open: + rv = UvOsOpen(path, flags, mode, &fd); + if (rv != 0) { + if (rv == UV_ENOENT && !(flags & UV_FS_O_CREAT)) { + exists = false; + flags |= UV_FS_O_CREAT; + mode = S_IRUSR | S_IWUSR; + goto open; + } + goto err; + } + + rv = UvOsWrite(fd, (const uv_buf_t *)buf, 1, 0); + if (rv != (int)(buf->len)) { + if (rv < 0) { + UvOsErrMsg(errmsg, "write", rv); + } else { + ErrMsgPrintf(errmsg, + "short write: %d only bytes written", rv); + } + goto err_after_file_open; + } + + if (exists) { + rv = UvOsFdatasync(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync", rv); + goto err_after_file_open; + } + } else { + rv = UvOsFsync(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync", rv); + goto err_after_file_open; + } + } + + rv = UvOsClose(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "close", rv); + goto err; + } + + if (!exists) { + rv = UvFsSyncDir(dir, errmsg); + if (rv != 0) { + goto err; + } + } + + return 0; + +err_after_file_open: + UvOsClose(fd); +err: + return RAFT_IOERR; +} + +int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg) +{ + ssize_t rv; + size_t offset = 0; + + /* TODO: use uv_fs_read() */ + while (offset < buf->len) { + rv = read(fd, (char *)buf->base + offset, buf->len - offset); + if (rv == -1) { + UvOsErrMsg(errmsg, "read", -errno); + return RAFT_IOERR; + } + /* EOF. Don't think this is reachable, but just make very sure + * we don't loop forever. 
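UvFsMakeFile above follows the classic durable-create pattern: write the content to a temporary file, persist it, atomically publish it under the final name, then persist the directory entry. A plain-POSIX sketch of the same sequence (error handling elided; the helper name and paths are hypothetical):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Plain-POSIX sketch of the pattern UvFsMakeFile implements with libuv
 * calls; error handling elided for brevity. */
static int durable_create(const char *dir, const char *tmp_path,
			  const char *final_path, const void *data,
			  size_t len)
{
	int fd = open(tmp_path, O_WRONLY | O_CREAT | O_EXCL, 0600);
	write(fd, data, len);         /* 1. write content to a temp file */
	fsync(fd);                    /* 2. persist the file data */
	close(fd);
	rename(tmp_path, final_path); /* 3. atomically publish the file */
	fd = open(dir, O_RDONLY | O_DIRECTORY);
	fsync(fd);                    /* 4. persist the directory entry */
	close(fd);
	return 0;
}
```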
*/ + if (rv == 0) { + break; + } + assert(rv > 0); + offset += (size_t)rv; + } + if (offset < buf->len) { + ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu", + offset, buf->len); + return RAFT_IOERR; + } + return 0; +} + +int UvFsReadFile(const char *dir, + const char *filename, + struct raft_buffer *buf, + char *errmsg) +{ + uv_stat_t sb; + char path[UV__PATH_SZ]; + uv_file fd; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = UvOsStat(path, &sb); + if (rv != 0) { + UvOsErrMsg(errmsg, "stat", rv); + rv = RAFT_IOERR; + goto err; + } + + rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg); + if (rv != 0) { + goto err; + } + + buf->len = (size_t)sb.st_size; + buf->base = RaftHeapMalloc(buf->len); + if (buf->base == NULL) { + ErrMsgOom(errmsg); + rv = RAFT_NOMEM; + goto err_after_open; + } + + rv = UvFsReadInto(fd, buf, errmsg); + if (rv != 0) { + goto err_after_buf_alloc; + } + + UvOsClose(fd); + + return 0; + +err_after_buf_alloc: + RaftHeapFree(buf->base); +err_after_open: + UvOsClose(fd); +err: + return rv; +} + +int UvFsReadFileInto(const char *dir, + const char *filename, + struct raft_buffer *buf, + char *errmsg) +{ + char path[UV__PATH_SZ]; + uv_file fd; + int rv; + + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = uvFsOpenFile(dir, filename, O_RDONLY, 0, &fd, errmsg); + if (rv != 0) { + goto err; + } + + rv = UvFsReadInto(fd, buf, errmsg); + if (rv != 0) { + goto err_after_open; + } + + UvOsClose(fd); + + return 0; + +err_after_open: + UvOsClose(fd); +err: + return rv; +} + +int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg) +{ + char path[UV__PATH_SZ]; + int rv; + rv = UvOsJoin(dir, filename, path); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsUnlink(path); + if (rv != 0) { + UvOsErrMsg(errmsg, "unlink", rv); + return RAFT_IOERR; + } + return 0; +} + +int UvFsRenameFile(const char *dir, + const char *filename1, + const char *filename2, + char *errmsg) +{ + char path1[UV__PATH_SZ]; + char path2[UV__PATH_SZ]; + int rv; + + rv = UvOsJoin(dir, filename1, path1); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsJoin(dir, filename2, path2); + if (rv != 0) { + return RAFT_INVALID; + } + + rv = UvOsRename(path1, path2); + if (rv != 0) { + UvOsErrMsg(errmsg, "rename", rv); + return rv; + } + + return 0; +} + +int UvFsTruncateAndRenameFile(const char *dir, + size_t size, + const char *filename1, + const char *filename2, + char *errmsg) +{ + char path1[UV__PATH_SZ]; + char path2[UV__PATH_SZ]; + uv_file fd; + int rv; + + rv = UvOsJoin(dir, filename1, path1); + if (rv != 0) { + return RAFT_INVALID; + } + rv = UvOsJoin(dir, filename2, path2); + if (rv != 0) { + return RAFT_INVALID; + } + + /* Truncate and rename. */ + rv = UvOsOpen(path1, UV_FS_O_RDWR, 0, &fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "open", rv); + goto err; + } + rv = UvOsTruncate(fd, (off_t)size); + if (rv != 0) { + UvOsErrMsg(errmsg, "truncate", rv); + goto err_after_open; + } + rv = UvOsFsync(fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fsync", rv); + goto err_after_open; + } + UvOsClose(fd); + + rv = UvOsRename(path1, path2); + if (rv != 0) { + UvOsErrMsg(errmsg, "rename", rv); + goto err; + } + + return 0; + +err_after_open: + UvOsClose(fd); +err: + return RAFT_IOERR; +} + +/* Check if direct I/O is possible on the given fd. */ +static int probeDirectIO(int fd, size_t *size, char *errmsg) +{ + struct statfs fs_info; /* To check the file system type. 
*/ + void *buf; /* Buffer to use for the probe write. */ + int rv; + + rv = UvOsSetDirectIo(fd); + if (rv != 0) { + if (rv != UV_EINVAL) { + /* UNTESTED: the parameters are ok, so this should never + * happen. */ + UvOsErrMsg(errmsg, "fnctl", rv); + return RAFT_IOERR; + } + rv = fstatfs(fd, &fs_info); + if (rv == -1) { + /* UNTESTED: in practice ENOMEM should be the only + * failure mode */ + UvOsErrMsg(errmsg, "fstatfs", -errno); + return RAFT_IOERR; + } + switch (fs_info.f_type) { + case 0x01021994: /* TMPFS_MAGIC */ + case 0x2fc12fc1: /* ZFS magic */ + case 0x24051905: /* UBIFS Support magic */ + *size = 0; + return 0; + default: + /* UNTESTED: this is an unsupported file system. + */ +#if defined(__s390x__) + ErrMsgPrintf(errmsg, + "unsupported file system: %ux", + fs_info.f_type); +#else + ErrMsgPrintf(errmsg, + "unsupported file system: %zx", + fs_info.f_type); +#endif + return RAFT_IOERR; + } + } + + /* Try to perform direct I/O, using various buffer size. */ + *size = 4096; + while (*size >= 512) { + buf = raft_aligned_alloc(*size, *size); + if (buf == NULL) { + ErrMsgOom(errmsg); + return RAFT_NOMEM; + } + memset(buf, 0, *size); + rv = (int)write(fd, buf, *size); + raft_aligned_free(*size, buf); + if (rv > 0) { + /* Since we fallocate'ed the file, we should never fail + * because of lack of disk space, and all bytes should + * have been written. */ + assert(rv == (int)(*size)); + return 0; + } + assert(rv == -1); + if (errno != EIO && errno != EOPNOTSUPP) { + /* UNTESTED: this should basically fail only because of + * disk errors, since we allocated the file with + * posix_fallocate. */ + + /* FIXME: this is a workaround because shiftfs doesn't + * return EINVAL in the fnctl call above, for example + * when the underlying fs is ZFS. */ + if (errno == EINVAL && *size == 4096) { + *size = 0; + return 0; + } + + UvOsErrMsg(errmsg, "write", -errno); + return RAFT_IOERR; + } + *size = *size / 2; + } + + *size = 0; + return 0; +} + +/* Check if fully non-blocking async I/O is possible on the given fd. */ +static int probeAsyncIO(int fd, size_t size, bool *ok, char *errmsg) +{ + void *buf; /* Buffer to use for the probe write */ + aio_context_t ctx = 0; /* KAIO context handle */ + struct iocb iocb; /* KAIO request object */ + struct iocb *iocbs = &iocb; /* Because the io_submit() API sucks */ + struct io_event event; /* KAIO response object */ + int n_events; + int rv; + + /* Setup the KAIO context handle */ + rv = UvOsIoSetup(1, &ctx); + if (rv != 0) { + UvOsErrMsg(errmsg, "io_setup", rv); + /* UNTESTED: in practice this should fail only with ENOMEM */ + return RAFT_IOERR; + } + + /* Allocate the write buffer */ + buf = raft_aligned_alloc(size, size); + if (buf == NULL) { + ErrMsgOom(errmsg); + return RAFT_NOMEM; + } + memset(buf, 0, size); + + /* Prepare the KAIO request object */ + memset(&iocb, 0, sizeof iocb); + iocb.aio_lio_opcode = IOCB_CMD_PWRITE; + *((void **)(&iocb.aio_buf)) = buf; + iocb.aio_nbytes = size; + iocb.aio_offset = 0; + iocb.aio_fildes = (uint32_t)fd; + iocb.aio_reqprio = 0; + iocb.aio_rw_flags |= RWF_NOWAIT | RWF_DSYNC; + + /* Submit the KAIO request */ + rv = UvOsIoSubmit(ctx, 1, &iocbs); + if (rv != 0) { + /* UNTESTED: in practice this should fail only with ENOMEM */ + raft_aligned_free(size, buf); + UvOsIoDestroy(ctx); + /* On ZFS 0.8 this is not properly supported yet. Also, when + * running on older kernels a binary compiled on a kernel with + * RWF_NOWAIT support, we might get EINVAL. 
*/ + if (errno == EOPNOTSUPP || errno == EINVAL) { + *ok = false; + return 0; + } + UvOsErrMsg(errmsg, "io_submit", rv); + return RAFT_IOERR; + } + + /* Fetch the response: will block until done. */ + n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL); + assert(n_events == 1); + if (n_events != 1) { + /* UNTESTED */ + UvOsErrMsg(errmsg, "UvOsIoGetevents", n_events); + return RAFT_IOERR; + } + + /* Release the write buffer. */ + raft_aligned_free(size, buf); + + /* Release the KAIO context handle. */ + rv = UvOsIoDestroy(ctx); + if (rv != 0) { + UvOsErrMsg(errmsg, "io_destroy", rv); + return RAFT_IOERR; + } + + if (event.res > 0) { + assert(event.res == (int)size); + *ok = true; + } else { + /* UNTESTED: this should basically fail only because of disk + * errors, since we allocated the file with posix_fallocate and + * the block size is supposed to be correct. */ + *ok = false; + if (event.res == -EAGAIN) { + /* If EAGAIN is encountered we assume the functionality + * is supported but this write would have blocked for + * some reason. UvWriter has a fallback mechanism to + * schedule writes on the thread pool in case the async + * write fails with EAGAIN, so this is safe. */ + *ok = true; + } + } + + return 0; +} + +#define UV__FS_PROBE_FALLOCATE_FILE ".probe_fallocate" +/* Leave detection of other error conditions to other probe* functions, only + * bother checking if posix_fallocate returns success. */ +static void probeFallocate(const char *dir, bool *fallocate) +{ + int flags = O_WRONLY | O_CREAT | O_EXCL; /* Common open flags */ + char ignored[RAFT_ERRMSG_BUF_SIZE]; + int rv = 0; + int fd = -1; + + *fallocate = false; + UvFsRemoveFile(dir, UV__FS_PROBE_FALLOCATE_FILE, ignored); + rv = uvFsOpenFile(dir, UV__FS_PROBE_FALLOCATE_FILE, flags, + S_IRUSR | S_IWUSR, &fd, ignored); + if (rv != 0) { + goto out; + } + rv = UvOsFallocate(fd, 0, (off_t)4096); + if (rv == 0) { + *fallocate = true; + } + +out: + UvFsRemoveFile(dir, UV__FS_PROBE_FALLOCATE_FILE, ignored); +} + +#define UV__FS_PROBE_FILE ".probe" +#define UV__FS_PROBE_FILE_SIZE 4096 +int UvFsProbeCapabilities(const char *dir, + size_t *direct, + bool *async, + bool *fallocate, + char *errmsg) +{ + int fd; /* File descriptor of the probe file */ + int rv; + char ignored[RAFT_ERRMSG_BUF_SIZE]; + + probeFallocate(dir, fallocate); + + /* Create a temporary probe file. */ + UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored); + rv = UvFsAllocateFile(dir, UV__FS_PROBE_FILE, UV__FS_PROBE_FILE_SIZE, + &fd, *fallocate, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "create I/O capabilities probe file"); + goto err; + } + UvFsRemoveFile(dir, UV__FS_PROBE_FILE, ignored); + + /* Check if we can use direct I/O. */ + rv = probeDirectIO(fd, direct, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "probe Direct I/O"); + goto err_after_file_open; + } + + /* If direct I/O is not possible, we can't perform fully asynchronous + * I/O, because io_submit might potentially block. */ + if (*direct == 0) { + *async = false; + goto out; + } + rv = probeAsyncIO(fd, *direct, async, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "probe Async I/O"); + goto err_after_file_open; + } + +out: + close(fd); + return 0; + +err_after_file_open: + close(fd); +err: + return rv; +} diff --git a/src/raft/uv_fs.h b/src/raft/uv_fs.h new file mode 100644 index 000000000..8e8159f9d --- /dev/null +++ b/src/raft/uv_fs.h @@ -0,0 +1,121 @@ +/* File system related utilities. 
*/
+
+#ifndef UV_FS_H_
+#define UV_FS_H_
+
+#include <uv.h>
+
+#include "../raft.h"
+#include "err.h"
+
+#define TMP_FILE_PREFIX "tmp-"
+#define TMP_FILE_FMT TMP_FILE_PREFIX "%s"
+
+/* Check that the given directory can be used. */
+int UvFsCheckDir(const char *dir, char *errmsg);
+
+/* Sync the given directory by calling fsync(). */
+int UvFsSyncDir(const char *dir, char *errmsg);
+
+/* Check whether the given file exists. */
+int UvFsFileExists(const char *dir,
+		   const char *filename,
+		   bool *exists,
+		   char *errmsg);
+
+/* Get the size of the given file. */
+int UvFsFileSize(const char *dir,
+		 const char *filename,
+		 off_t *size,
+		 char *errmsg);
+
+/* Check whether the given file in the given directory is empty. */
+int UvFsFileIsEmpty(const char *dir,
+		    const char *filename,
+		    bool *empty,
+		    char *errmsg);
+
+/* Create the given file in the given directory and allocate the given size to
+ * it, returning its file descriptor. The file must not exist yet. */
+int UvFsAllocateFile(const char *dir,
+		     const char *filename,
+		     size_t size,
+		     uv_file *fd,
+		     bool fallocate,
+		     char *errmsg);
+
+/* Create a file and write the given content into it. */
+int UvFsMakeFile(const char *dir,
+		 const char *filename,
+		 struct raft_buffer *bufs,
+		 unsigned n_bufs,
+		 char *errmsg);
+
+/* Create or overwrite a file.
+ *
+ * If the file does not exist yet, it gets created, the given content written
+ * to it, and then fully persisted to disk by fsync()'ing the file and the
+ * dir.
+ *
+ * If the file already exists, it gets overwritten. The assumption is that the
+ * file size will stay the same and its content will change, so only
+ * fdatasync() will be used. */
+int UvFsMakeOrOverwriteFile(const char *dir,
+			    const char *filename,
+			    const struct raft_buffer *buf,
+			    char *errmsg);
+
+/* Open a file for reading. */
+int UvFsOpenFileForReading(const char *dir,
+			   const char *filename,
+			   uv_file *fd,
+			   char *errmsg);
+
+/* Read exactly buf->len bytes from the given file descriptor into buf->base.
+ * Fail if fewer than buf->len bytes are read. */
+int UvFsReadInto(uv_file fd, struct raft_buffer *buf, char *errmsg);
+
+/* Read all the content of the given file. */
+int UvFsReadFile(const char *dir,
+		 const char *filename,
+		 struct raft_buffer *buf,
+		 char *errmsg);
+
+/* Read exactly buf->len bytes from the given file into buf->base. Fail if
+ * fewer than buf->len bytes are read. */
+int UvFsReadFileInto(const char *dir,
+		     const char *filename,
+		     struct raft_buffer *buf,
+		     char *errmsg);
+
+/* Synchronously remove a file, calling the unlink() system call. */
+int UvFsRemoveFile(const char *dir, const char *filename, char *errmsg);
+
+/* Synchronously truncate a file to the given size and then rename it. */
+int UvFsTruncateAndRenameFile(const char *dir,
+			      size_t size,
+			      const char *filename1,
+			      const char *filename2,
+			      char *errmsg);
+
+/* Synchronously rename a file. */
+int UvFsRenameFile(const char *dir,
+		   const char *filename1,
+		   const char *filename2,
+		   char *errmsg);
+
+/* Return information about the I/O capabilities of the underlying file
+ * system.
+ *
+ * The @direct parameter will be set to zero if direct I/O is not possible, or
+ * to the block size to use for direct I/O otherwise.
+ *
+ * The @async parameter will be set to true if fully asynchronous I/O is
+ * possible using the KAIO API.
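A hypothetical startup sequence using the capability probe declared just below (the directory path is made up):

```c
static int probe_data_dir(const char *dir)
{
	char errmsg[RAFT_ERRMSG_BUF_SIZE];
	size_t direct;  /* 0, or the block size to use with O_DIRECT */
	bool async;     /* true if KAIO + RWF_NOWAIT writes worked */
	bool fallocate; /* true if posix_fallocate() is supported */
	int rv;

	rv = UvFsProbeCapabilities(dir, &direct, &async, &fallocate, errmsg);
	if (rv != 0) {
		return rv; /* errmsg explains which probe failed */
	}
	/* Note that direct == 0 implies async == false: without direct I/O,
	 * io_submit() could block, so writes fall back to the thread pool. */
	return 0;
}
```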
*/
+int UvFsProbeCapabilities(const char *dir,
+			  size_t *direct,
+			  bool *async,
+			  bool *fallocate,
+			  char *errmsg);
+
+#endif /* UV_FS_H_ */
diff --git a/src/raft/uv_ip.c b/src/raft/uv_ip.c
new file mode 100644
index 000000000..4e4ff9f3f
--- /dev/null
+++ b/src/raft/uv_ip.c
@@ -0,0 +1,86 @@
+#include <netdb.h>
+#include <string.h>
+
+#include
+
+#include "../raft.h"
+
+#include "uv_ip.h"
+
+static const char *strCpyUntil(char *target,
+			       const char *source,
+			       size_t target_size,
+			       char separator)
+{
+	size_t i;
+	for (i = 0; i < target_size; ++i) {
+		if (!source[i] || source[i] == separator) {
+			target[i] = 0;
+			return source + i;
+		} else {
+			target[i] = source[i];
+		}
+	}
+	return NULL;
+}
+
+int uvIpAddrSplit(const char *address,
+		  char *host,
+		  size_t host_size,
+		  char *service,
+		  size_t service_size)
+{
+	char colon = ':';
+	const char *service_ptr = NULL;
+
+	if (host) {
+		service_ptr = strCpyUntil(host, address, host_size, colon);
+		if (!service_ptr) {
+			return RAFT_NAMETOOLONG;
+		}
+	}
+	if (service) {
+		if (!service_ptr) {
+			service_ptr = strchr(address, colon);
+		}
+		if (!service_ptr || *service_ptr == 0 ||
+		    *(++service_ptr) == 0) {
+			service_ptr = "8080";
+		}
+		if (!strCpyUntil(service, service_ptr, service_size, 0)) {
+			return RAFT_NAMETOOLONG;
+		}
+	}
+	return 0;
+}
+
+/* Synchronously resolve a hostname to an IP address. */
+int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result)
+{
+	static struct addrinfo hints = {
+	    .ai_flags = AI_ADDRCONFIG | AI_PASSIVE | AI_NUMERICSERV,
+	    .ai_family = AF_INET,
+	    .ai_socktype = SOCK_STREAM,
+	    .ai_protocol = 0};
+	char hostname[NI_MAXHOST];
+	char service[NI_MAXSERV];
+	int rv;
+
+	rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service,
+			   sizeof(service));
+	if (rv != 0) {
+		return rv;
+	}
+
+	if (hostname[0]) {
+		rv = getaddrinfo(hostname, service, &hints, ai_result);
+	} else {
+		rv = getaddrinfo(NULL, service, &hints, ai_result);
+	}
+
+	if (rv != 0) {
+		return RAFT_IOERR;
+	}
+
+	return 0;
+}
diff --git a/src/raft/uv_ip.h b/src/raft/uv_ip.h
new file mode 100644
index 000000000..8cda2b91c
--- /dev/null
+++ b/src/raft/uv_ip.h
@@ -0,0 +1,20 @@
+/* IP-related utils. */
+
+#ifndef UV_IP_H_
+#define UV_IP_H_
+
+#include <stddef.h>
+
+/* Split @address into @host and @service. */
+int uvIpAddrSplit(const char *address,
+		  char *host,
+		  size_t host_size,
+		  char *service,
+		  size_t service_size);
+
+struct addrinfo;
+
+/* Synchronously resolve a hostname to an IP address. */
+int uvIpResolveBindAddresses(const char *address, struct addrinfo **ai_result);
+
+#endif /* UV_IP_H_ */
diff --git a/src/raft/uv_list.c b/src/raft/uv_list.c
new file mode 100644
index 000000000..18639db36
--- /dev/null
+++ b/src/raft/uv_list.c
@@ -0,0 +1,116 @@
+#include <string.h>
+
+#include "assert.h"
+#include "uv.h"
+
+static const char *uvListIgnored[] = {".", "..", "metadata1", "metadata2",
+				      NULL};
+
+/* Return true if the given filename should be ignored.
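uvIpAddrSplit above defaults the service to "8080" when the address carries no port. A short usage example (NI_MAXHOST and NI_MAXSERV come from <netdb.h>):

```c
#include <netdb.h> /* NI_MAXHOST, NI_MAXSERV */

static void split_examples(void)
{
	char host[NI_MAXHOST];
	char service[NI_MAXSERV];

	/* "127.0.0.1:9001" -> host "127.0.0.1", service "9001" */
	uvIpAddrSplit("127.0.0.1:9001", host, sizeof host, service,
		      sizeof service);

	/* No port in the address: service defaults to "8080" */
	uvIpAddrSplit("127.0.0.1", host, sizeof host, service,
		      sizeof service);
}
```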
*/ +static bool uvListShouldIgnore(const char *filename) +{ + const char **cursor = uvListIgnored; + bool result = false; + if (strlen(filename) >= UV__FILENAME_LEN) { + return true; + } + while (*cursor != NULL) { + if (strcmp(filename, *cursor) == 0) { + result = true; + break; + } + cursor++; + } + return result; +} + +int UvList(struct uv *uv, + struct uvSnapshotInfo *snapshots[], + size_t *n_snapshots, + struct uvSegmentInfo *segments[], + size_t *n_segments, + char *errmsg) +{ + struct uv_fs_s req; + struct uv_dirent_s entry; + int n; + int i; + int rv; + int rv2; + + n = uv_fs_scandir(NULL, &req, uv->dir, 0, NULL); + if (n < 0) { + ErrMsgPrintf(errmsg, "scan data directory: %s", uv_strerror(n)); + return RAFT_IOERR; + } + + *snapshots = NULL; + *n_snapshots = 0; + + *segments = NULL; + *n_segments = 0; + + rv = 0; + + for (i = 0; i < n; i++) { + const char *filename; + bool appended; + + rv = uv_fs_scandir_next(&req, &entry); + assert(rv == 0); /* Can't fail in libuv */ + + filename = entry.name; + + /* If an error occurred while processing a preceeding entry or + * if we know that this is not a segment filename, just free it + * and skip to the next one. */ + if (rv != 0 || uvListShouldIgnore(filename)) { + if (rv == 0) { + tracef("ignore %s", filename); + } + continue; + } + + /* Append to the snapshot list if it's a snapshot metadata + * filename and a valid associated snapshot file exists. */ + rv = UvSnapshotInfoAppendIfMatch(uv, filename, snapshots, + n_snapshots, &appended); + if (appended || rv != 0) { + if (rv == 0) { + tracef("snapshot %s", filename); + } + continue; + } + + /* Append to the segment list if it's a segment filename */ + rv = uvSegmentInfoAppendIfMatch(entry.name, segments, + n_segments, &appended); + if (appended || rv != 0) { + if (rv == 0) { + tracef("segment %s", filename); + } + continue; + } + + tracef("ignore %s", filename); + } + + rv2 = uv_fs_scandir_next(&req, &entry); + assert(rv2 == UV_EOF); + + if (rv != 0 && *segments != NULL) { + raft_free(*segments); + } + + if (*snapshots != NULL) { + UvSnapshotSort(*snapshots, *n_snapshots); + } + + if (*segments != NULL) { + uvSegmentSort(*segments, *n_segments); + } + + return rv; +} + +#undef tracef diff --git a/src/raft/uv_metadata.c b/src/raft/uv_metadata.c new file mode 100644 index 000000000..aee87323f --- /dev/null +++ b/src/raft/uv_metadata.c @@ -0,0 +1,204 @@ +#include "assert.h" +#include "byte.h" +#include "uv.h" +#include "uv_encoding.h" + +/* We have metadata1 and metadata2. */ +#define METADATA_FILENAME_PREFIX "metadata" +#define METADATA_FILENAME_SIZE (sizeof(METADATA_FILENAME_PREFIX) + 2) + +/* Format, version, term, vote */ +#define METADATA_CONTENT_SIZE (8 * 4) + +/* Encode the content of a metadata file. */ +static void uvMetadataEncode(const struct uvMetadata *metadata, void *buf) +{ + void *cursor = buf; + bytePut64(&cursor, UV__DISK_FORMAT); + bytePut64(&cursor, metadata->version); + bytePut64(&cursor, metadata->term); + bytePut64(&cursor, metadata->voted_for); +} + +/* Decode the content of a metadata file. 
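+ *
+ * The buffer is expected to follow the fixed 32-byte layout produced by
+ * uvMetadataEncode() above (four 64-bit words, METADATA_CONTENT_SIZE bytes
+ * in total):
+ *
+ *   offset  0: format version (must be UV__DISK_FORMAT)
+ *   offset  8: metadata version
+ *   offset 16: current term
+ *   offset 24: voted for (server ID, 0 if none)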
+ */
+static int uvMetadataDecode(const void *buf,
+                            struct uvMetadata *metadata,
+                            char *errmsg)
+{
+        const void *cursor = buf;
+        uint64_t format;
+        format = byteGet64(&cursor);
+        if (format != UV__DISK_FORMAT) {
+                ErrMsgPrintf(errmsg, "bad format version %ju", format);
+                return RAFT_MALFORMED;
+        }
+        metadata->version = byteGet64(&cursor);
+        metadata->term = byteGet64(&cursor);
+        metadata->voted_for = byteGet64(&cursor);
+
+        /* Coherence check: make sure the decoded values make sense. */
+        if (metadata->version == 0) {
+                ErrMsgPrintf(errmsg, "version is set to zero");
+                return RAFT_CORRUPT;
+        }
+
+        return 0;
+}
+
+/* Render the filename of the metadata file with index @n. */
+static void uvMetadataFilename(const unsigned short n, char *filename)
+{
+        sprintf(filename, METADATA_FILENAME_PREFIX "%d", n);
+}
+
+/* Read the n'th metadata file (with n equal to 1 or 2) and decode the content
+ * of the file, populating the given metadata buffer accordingly. */
+static int uvMetadataLoadN(const char *dir,
+                           const unsigned short n,
+                           struct uvMetadata *metadata,
+                           char *errmsg)
+{
+        char filename[METADATA_FILENAME_SIZE];  /* Filename of the metadata
+                                                   file */
+        uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */
+        off_t size;
+        struct raft_buffer buf;
+        bool exists;
+        int rv;
+
+        assert(n == 1 || n == 2);
+
+        /* Render the metadata path */
+        uvMetadataFilename(n, filename);
+
+        rv = UvFsFileExists(dir, filename, &exists, errmsg);
+        if (rv != 0) {
+                ErrMsgWrapf(errmsg, "check if %s exists", filename);
+                return rv;
+        }
+
+        memset(metadata, 0, sizeof *metadata);
+
+        /* If the file does not exist, just return. */
+        if (!exists) {
+                return 0;
+        }
+
+        /* If the file exists but has fewer bytes than expected, assume that
+         * the server crashed while writing this metadata file, and pretend it
+         * has not been written at all. If it has more bytes than expected,
+         * return an error. */
+        rv = UvFsFileSize(dir, filename, &size, errmsg);
+        if (rv != 0) {
+                ErrMsgWrapf(errmsg, "check size of %s", filename);
+                return rv;
+        }
+
+        if (size != sizeof content) {
+                if ((size_t)size < sizeof content) {
+                        rv = UvFsRemoveFile(dir, filename, errmsg);
+                        if (rv != 0) {
+                                return rv;
+                        }
+                        return 0;
+                }
+                ErrMsgPrintf(errmsg, "%s has size %jd instead of %zu", filename,
+                             (intmax_t)size, sizeof content);
+                return RAFT_CORRUPT;
+        }
+
+        /* Read the content of the metadata file. */
+        buf.base = content;
+        buf.len = sizeof content;
+
+        rv = UvFsReadFileInto(dir, filename, &buf, errmsg);
+        if (rv != 0) {
+                ErrMsgWrapf(errmsg, "read content of %s", filename);
+                return rv;
+        }
+
+        /* Decode the content of the metadata file. */
+        rv = uvMetadataDecode(content, metadata, errmsg);
+        if (rv != 0) {
+                ErrMsgWrapf(errmsg, "decode content of %s", filename);
+                return rv;
+        }
+
+        return 0;
+}
+
+int uvMetadataLoad(const char *dir, struct uvMetadata *metadata, char *errmsg)
+{
+        struct uvMetadata metadata1;
+        struct uvMetadata metadata2;
+        int rv;
+
+        /* Read the two metadata files (if available). */
+        rv = uvMetadataLoadN(dir, 1, &metadata1, errmsg);
+        if (rv != 0) {
+                return rv;
+        }
+        rv = uvMetadataLoadN(dir, 2, &metadata2, errmsg);
+        if (rv != 0) {
+                return rv;
+        }
+
+        /* Check the versions. */
+        if (metadata1.version == 0 && metadata2.version == 0) {
+                /* Neither metadata file exists: we have a brand new server. */
+                metadata->version = 0;
+                metadata->term = 0;
+                metadata->voted_for = 0;
+        } else if (metadata1.version == metadata2.version) {
+                /* The two metadata files can't have the same version.
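+                 * Stores alternate between the two files: odd metadata
+                 * versions go to metadata1 and even ones to metadata2 (see
+                 * uvMetadataFileIndex() below), so a crash in the middle of a
+                 * write can damage at most the file holding the newer
+                 * version, while the other file still holds a complete older
+                 * copy. For example:
+                 *
+                 *   store v1 -> metadata1; store v2 -> metadata2;
+                 *   store v3 -> metadata1 (a crash here leaves v2 intact)
+                 *
+                 * Equal versions can therefore only mean on-disk corruption.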
*/ + ErrMsgPrintf(errmsg, + "metadata1 and metadata2 are both at version %llu", + metadata1.version); + return RAFT_CORRUPT; + } else { + /* Pick the metadata with the grater version. */ + if (metadata1.version > metadata2.version) { + *metadata = metadata1; + } else { + *metadata = metadata2; + } + } + + return 0; +} + +/* Return the metadata file index associated with the given version. */ +static unsigned short uvMetadataFileIndex(unsigned long long version) +{ + return version % 2 == 1 ? 1 : 2; +} + +int uvMetadataStore(struct uv *uv, const struct uvMetadata *metadata) +{ + char filename[METADATA_FILENAME_SIZE]; /* Filename of the metadata file + */ + uint8_t content[METADATA_CONTENT_SIZE]; /* Content of metadata file */ + struct raft_buffer buf; + unsigned short n; + int rv; + + assert(metadata->version > 0); + + /* Encode the given metadata. */ + uvMetadataEncode(metadata, content); + + /* Render the metadata file name. */ + n = uvMetadataFileIndex(metadata->version); + uvMetadataFilename(n, filename); + + /* Write the metadata file, creating it if it does not exist. */ + buf.base = content; + buf.len = sizeof content; + rv = UvFsMakeOrOverwriteFile(uv->dir, filename, &buf, uv->io->errmsg); + if (rv != 0) { + ErrMsgWrapf(uv->io->errmsg, "persist %s", filename); + return rv; + } + + return 0; +} diff --git a/src/raft/uv_os.c b/src/raft/uv_os.c new file mode 100644 index 000000000..8a96ab130 --- /dev/null +++ b/src/raft/uv_os.c @@ -0,0 +1,222 @@ +#include "uv_os.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "assert.h" +#include "err.h" +#include "syscall.h" + +/* Default permissions when creating a directory. */ +#define DEFAULT_DIR_PERM 0700 + +int UvOsOpen(const char *path, int flags, int mode, uv_file *fd) +{ + struct uv_fs_s req; + int rv; + rv = uv_fs_open(NULL, &req, path, flags, mode, NULL); + if (rv < 0) { + return rv; + } + *fd = rv; + return 0; +} + +int UvOsClose(uv_file fd) +{ + struct uv_fs_s req; + return uv_fs_close(NULL, &req, fd, NULL); +} + +/* Emulate fallocate(). Mostly taken from glibc's implementation. */ +int UvOsFallocateEmulation(int fd, off_t offset, off_t len) +{ + ssize_t increment; + struct statfs f; + int rv; + + rv = fstatfs(fd, &f); + if (rv != 0) { + return -errno; + } + + if (f.f_bsize == 0) { + increment = 512; + } else if (f.f_bsize < 4096) { + increment = f.f_bsize; + } else { + increment = 4096; + } + + for (offset += (len - 1) % increment; len > 0; offset += increment) { + len -= increment; + rv = (int)pwrite(fd, "", 1, offset); + if (rv != 1) { + return -errno; + } + } + + return 0; +} + +int UvOsFallocate(uv_file fd, off_t offset, off_t len) +{ + int rv; + rv = posix_fallocate(fd, offset, len); + if (rv != 0) { + /* From the manual page: + * + * posix_fallocate() returns zero on success, or an error + * number on failure. Note that errno is not set. 
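+                 *
+                 * A caller that must keep working on file systems without
+                 * fallocate() support can fall back to the emulation above
+                 * when posix_fallocate() reports EOPNOTSUPP; a sketch:
+                 *
+                 *   rv = UvOsFallocate(fd, offset, size);
+                 *   if (rv == -EOPNOTSUPP) {
+                 *       rv = UvOsFallocateEmulation(fd, offset, size);
+                 *   }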
+ */ + return -rv; + } + return 0; +} + +int UvOsTruncate(uv_file fd, off_t offset) +{ + struct uv_fs_s req; + return uv_fs_ftruncate(NULL, &req, fd, offset, NULL); +} + +int UvOsFsync(uv_file fd) +{ + struct uv_fs_s req; + return uv_fs_fsync(NULL, &req, fd, NULL); +} + +int UvOsFdatasync(uv_file fd) +{ + struct uv_fs_s req; + return uv_fs_fdatasync(NULL, &req, fd, NULL); +} + +int UvOsStat(const char *path, uv_stat_t *sb) +{ + struct uv_fs_s req; + int rv; + rv = uv_fs_stat(NULL, &req, path, NULL); + if (rv != 0) { + return rv; + } + memcpy(sb, &req.statbuf, sizeof *sb); + return 0; +} + +int UvOsWrite(uv_file fd, + const uv_buf_t bufs[], + unsigned int nbufs, + int64_t offset) +{ + struct uv_fs_s req; + return uv_fs_write(NULL, &req, fd, bufs, nbufs, offset, NULL); +} + +int UvOsUnlink(const char *path) +{ + struct uv_fs_s req; + return uv_fs_unlink(NULL, &req, path, NULL); +} + +int UvOsRename(const char *path1, const char *path2) +{ + struct uv_fs_s req; + return uv_fs_rename(NULL, &req, path1, path2, NULL); +} + +int UvOsJoin(const char *dir, const char *filename, char *path) +{ + if (!UV__DIR_HAS_VALID_LEN(dir) || + !UV__FILENAME_HAS_VALID_LEN(filename)) { + return -1; + } + strcpy(path, dir); + strcat(path, "/"); + strcat(path, filename); + return 0; +} + +int UvOsIoSetup(unsigned nr, aio_context_t *ctxp) +{ + int rv; + rv = io_setup(nr, ctxp); + if (rv == -1) { + return -errno; + } + return 0; +} + +int UvOsIoDestroy(aio_context_t ctx) +{ + int rv; + rv = io_destroy(ctx); + if (rv == -1) { + return -errno; + } + return 0; +} + +int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp) +{ + int rv; + rv = io_submit(ctx, nr, iocbpp); + if (rv == -1) { + return -errno; + } + assert(rv == nr); /* TODO: can something else be returned? */ + return 0; +} + +int UvOsIoGetevents(aio_context_t ctx, + long min_nr, + long max_nr, + struct io_event *events, + struct timespec *timeout) +{ + int rv; + do { + rv = io_getevents(ctx, min_nr, max_nr, events, timeout); + } while (rv == -1 && errno == EINTR); + + if (rv == -1) { + return -errno; + } + assert(rv >= min_nr); + assert(rv <= max_nr); + return rv; +} + +int UvOsEventfd(unsigned int initval, int flags) +{ + int rv; + /* At the moment only UV_FS_O_NONBLOCK is supported */ + assert(flags == UV_FS_O_NONBLOCK); + flags = EFD_NONBLOCK | EFD_CLOEXEC; + rv = eventfd(initval, flags); + if (rv == -1) { + return -errno; + } + return rv; +} + +int UvOsSetDirectIo(uv_file fd) +{ + int flags; /* Current fcntl flags */ + int rv; + flags = fcntl(fd, F_GETFL); + rv = fcntl(fd, F_SETFL, flags | UV_FS_O_DIRECT); + if (rv == -1) { + return -errno; + } + return 0; +} diff --git a/src/raft/uv_os.h b/src/raft/uv_os.h new file mode 100644 index 000000000..741dd8887 --- /dev/null +++ b/src/raft/uv_os.h @@ -0,0 +1,95 @@ +/* Operating system related utilities. */ + +#ifndef UV_OS_H_ +#define UV_OS_H_ + +#include +#include +#include +#include +#include + +/* Maximum size of a full file system path string. */ +#define UV__PATH_SZ 1024 + +/* Maximum length of a filename string. */ +#define UV__FILENAME_LEN 128 + +/* Length of path separator. */ +#define UV__SEP_LEN 1 /* strlen("/") */ + +/* True if STR's length is at most LEN. */ +#define LEN_AT_MOST_(STR, LEN) (strnlen(STR, LEN + 1) <= LEN) + +/* Maximum length of a directory path string. */ +#define UV__DIR_LEN (UV__PATH_SZ - UV__SEP_LEN - UV__FILENAME_LEN - 1) + +/* True if the given DIR string has at most UV__DIR_LEN chars. 
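+ *
+ * The arithmetic: UV__DIR_LEN (894) + UV__SEP_LEN (1) + UV__FILENAME_LEN
+ * (128) + 1 byte for the terminating NUL add up to UV__PATH_SZ (1024),
+ * which is exactly the invariant that UvOsJoin() relies on when it
+ * concatenates dir + "/" + filename into a UV__PATH_SZ-sized buffer.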
*/ +#define UV__DIR_HAS_VALID_LEN(DIR) LEN_AT_MOST_(DIR, UV__DIR_LEN) + +/* True if the given FILENAME string has at most UV__FILENAME_LEN chars. */ +#define UV__FILENAME_HAS_VALID_LEN(FILENAME) \ + LEN_AT_MOST_(FILENAME, UV__FILENAME_LEN) + +/* Portable open() */ +int UvOsOpen(const char *path, int flags, int mode, uv_file *fd); + +/* Portable close() */ +int UvOsClose(uv_file fd); + +/* TODO: figure a portable abstraction. */ +int UvOsFallocate(uv_file fd, off_t offset, off_t len); + +/* Emulation to use in case UvOsFallocate fails with -EONOTSUPP. + * This might happen with a libc implementation (e.g. musl) that + * doesn't implement a transparent fallback if fallocate() is + * not supported by the underlying file system. */ +int UvOsFallocateEmulation(int fd, off_t offset, off_t len); + +/* Portable truncate() */ +int UvOsTruncate(uv_file fd, off_t offset); + +/* Portable fsync() */ +int UvOsFsync(uv_file fd); + +/* Portable fdatasync() */ +int UvOsFdatasync(uv_file fd); + +/* Portable stat() */ +int UvOsStat(const char *path, uv_stat_t *sb); + +/* Portable write() */ +int UvOsWrite(uv_file fd, + const uv_buf_t bufs[], + unsigned int nbufs, + int64_t offset); + +/* Portable unlink() */ +int UvOsUnlink(const char *path); + +/* Portable rename() */ +int UvOsRename(const char *path1, const char *path2); + +/* Join dir and filename into a full OS path. */ +int UvOsJoin(const char *dir, const char *filename, char *path); + +/* TODO: figure a portable abstraction. */ +int UvOsIoSetup(unsigned nr, aio_context_t *ctxp); +int UvOsIoDestroy(aio_context_t ctx); +int UvOsIoSubmit(aio_context_t ctx, long nr, struct iocb **iocbpp); +int UvOsIoGetevents(aio_context_t ctx, + long min_nr, + long max_nr, + struct io_event *events, + struct timespec *timeout); +int UvOsEventfd(unsigned int initval, int flags); +int UvOsSetDirectIo(uv_file fd); + +/* Format an error message caused by a failed system call or stdlib function. */ +#define UvOsErrMsg(ERRMSG, SYSCALL, ERRNUM) \ + { \ + ErrMsgPrintf(ERRMSG, "%s", uv_strerror(ERRNUM)); \ + ErrMsgWrapf(ERRMSG, SYSCALL); \ + } + +#endif /* UV_OS_H_ */ diff --git a/src/raft/uv_prepare.c b/src/raft/uv_prepare.c new file mode 100644 index 000000000..00355480f --- /dev/null +++ b/src/raft/uv_prepare.c @@ -0,0 +1,339 @@ +#include +#include + +#include "assert.h" +#include "heap.h" +#include "uv.h" +#include "uv_os.h" + +/* The happy path for UvPrepare is: + * + * - If there is an unused open segment available, return its fd and counter + * immediately. + * + * - Otherwise, wait for the creation of a new open segment to complete, + * possibly kicking off the creation logic if no segment is being created + * currently. + * + * Possible failure modes are: + * + * - The create file request fails, in that case we fail all pending prepare + * requests and we mark the uv instance as errored. + * + * On close: + * + * - Cancel all pending prepare requests. + * - Remove unused prepared open segments. + * - Wait for any pending internal segment creation and then discard the newly + * created segment. + */ + +/* Number of open segments that we try to keep ready for writing. 
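+ *
+ * With a target of two, the writer can grab a prepared segment immediately
+ * while the threadpool allocates the next one in the background. A
+ * consumer-side sketch (prepareCb is a hypothetical callback):
+ *
+ *   struct uvPrepare req;
+ *   uv_file fd;
+ *   uvCounter counter;
+ *   rv = UvPrepare(uv, &fd, &counter, &req, prepareCb);
+ *
+ * On success, fd is valid right away if the pool was non-empty; otherwise
+ * prepareCb fires once a new segment has been created.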
*/ +#define UV__TARGET_POOL_SIZE 2 + +/* An open segment being prepared or sitting in the pool */ +struct uvIdleSegment +{ + struct uv *uv; /* Open segment file */ + size_t size; /* Segment size */ + struct uv_work_s work; /* To execute logic in the threadpool */ + int status; /* Result of threadpool callback */ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; /* Error of threadpool callback */ + unsigned long long counter; /* Segment counter */ + char filename[UV__FILENAME_LEN]; /* Filename of the segment */ + uv_file fd; /* File descriptor of prepared file */ + queue queue; /* Pool */ +}; + +static void uvPrepareWorkCb(uv_work_t *work) +{ + struct uvIdleSegment *segment = work->data; + struct uv *uv = segment->uv; + int rv; + + rv = UvFsAllocateFile(uv->dir, segment->filename, segment->size, + &segment->fd, uv->fallocate, segment->errmsg); + if (rv != 0) { + goto err; + } + + rv = UvFsSyncDir(uv->dir, segment->errmsg); + if (rv != 0) { + goto err_after_allocate; + } + + segment->status = 0; + return; + +err_after_allocate: + UvOsClose(segment->fd); +err: + assert(rv != 0); + segment->status = rv; + return; +} + +/* Flush all pending requests, invoking their callbacks with the given + * status. */ +static void uvPrepareFinishAllRequests(struct uv *uv, int status) +{ + while (!QUEUE_IS_EMPTY(&uv->prepare_reqs)) { + queue *head; + struct uvPrepare *req; + head = QUEUE_HEAD(&uv->prepare_reqs); + req = QUEUE_DATA(head, struct uvPrepare, queue); + QUEUE_REMOVE(&req->queue); + req->cb(req, status); + } +} + +/* Pop the oldest prepared segment in the pool and return its fd and counter + * through the given pointers. */ +static void uvPrepareConsume(struct uv *uv, uv_file *fd, uvCounter *counter) +{ + queue *head; + struct uvIdleSegment *segment; + /* Pop a segment from the pool. */ + head = QUEUE_HEAD(&uv->prepare_pool); + segment = QUEUE_DATA(head, struct uvIdleSegment, queue); + assert(segment->fd >= 0); + QUEUE_REMOVE(&segment->queue); + *fd = segment->fd; + *counter = segment->counter; + RaftHeapFree(segment); +} + +/* Finish the oldest pending prepare request using the next available prepared + * segment. */ +static void uvPrepareFinishOldestRequest(struct uv *uv) +{ + queue *head; + struct uvPrepare *req; + + assert(!uv->closing); + assert(!QUEUE_IS_EMPTY(&uv->prepare_reqs)); + assert(!QUEUE_IS_EMPTY(&uv->prepare_pool)); + + /* Pop the head of the prepare requests queue. */ + head = QUEUE_HEAD(&uv->prepare_reqs); + req = QUEUE_DATA(head, struct uvPrepare, queue); + QUEUE_REMOVE(&req->queue); + + /* Finish the request */ + uvPrepareConsume(uv, &req->fd, &req->counter); + req->cb(req, 0); +} + +/* Return the number of ready prepared open segments in the pool. */ +static unsigned uvPrepareCount(struct uv *uv) +{ + queue *head; + unsigned n; + n = 0; + QUEUE_FOREACH(head, &uv->prepare_pool) + { + n++; + } + return n; +} + +static void uvPrepareAfterWorkCb(uv_work_t *work, int status); + +/* Start creating a new segment file. 
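+ *
+ * The file is named "open-<counter>" (see UV__OPEN_TEMPLATE), is sized to a
+ * whole number of blocks (uv->block_size * uvSegmentBlocks(uv)), and is
+ * allocated in the threadpool so that the event loop never blocks on disk
+ * I/O.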
*/ +static int uvPrepareStart(struct uv *uv) +{ + struct uvIdleSegment *segment; + int rv; + + assert(uv->prepare_inflight == NULL); + assert(uvPrepareCount(uv) < UV__TARGET_POOL_SIZE); + + segment = RaftHeapMalloc(sizeof *segment); + if (segment == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + memset(segment, 0, sizeof *segment); + segment->uv = uv; + segment->counter = uv->prepare_next_counter; + segment->work.data = segment; + segment->fd = -1; + segment->size = uv->block_size * uvSegmentBlocks(uv); + sprintf(segment->filename, UV__OPEN_TEMPLATE, segment->counter); + + tracef("create open segment %s", segment->filename); + rv = uv_queue_work(uv->loop, &segment->work, uvPrepareWorkCb, + uvPrepareAfterWorkCb); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this can't + * fail. */ + tracef("can't create segment %s: %s", segment->filename, + uv_strerror(rv)); + rv = RAFT_IOERR; + goto err_after_segment_alloc; + } + + uv->prepare_inflight = segment; + uv->prepare_next_counter++; + + return 0; + +err_after_segment_alloc: + RaftHeapFree(segment); +err: + assert(rv != 0); + return rv; +} + +static void uvPrepareAfterWorkCb(uv_work_t *work, int status) +{ + struct uvIdleSegment *segment = work->data; + struct uv *uv = segment->uv; + int rv; + assert(status == 0); + + uv->prepare_inflight = + NULL; /* Reset the creation in-progress marker. */ + + /* If we are closing, let's discard the segment. All pending requests + * have already being fired with RAFT_CANCELED. */ + if (uv->closing) { + assert(QUEUE_IS_EMPTY(&uv->prepare_pool)); + assert(QUEUE_IS_EMPTY(&uv->prepare_reqs)); + if (segment->status == 0) { + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + UvOsClose(segment->fd); + UvFsRemoveFile(uv->dir, segment->filename, errmsg); + } + tracef("canceled creation of %s", segment->filename); + RaftHeapFree(segment); + uvMaybeFireCloseCb(uv); + return; + } + + /* If the request has failed, mark all pending requests as failed and + * don't try to create any further segment. + * + * Note that if there's no pending request, we don't set the error + * message, to avoid overwriting previous errors. */ + if (segment->status != 0) { + if (!QUEUE_IS_EMPTY(&uv->prepare_reqs)) { + ErrMsgTransferf(segment->errmsg, uv->io->errmsg, + "create segment %s", segment->filename); + uvPrepareFinishAllRequests(uv, segment->status); + } + uv->errored = true; + RaftHeapFree(segment); + return; + } + + assert(segment->fd >= 0); + + tracef("completed creation of %s", segment->filename); + QUEUE_PUSH(&uv->prepare_pool, &segment->queue); + + /* Let's process any pending request. */ + if (!QUEUE_IS_EMPTY(&uv->prepare_reqs)) { + uvPrepareFinishOldestRequest(uv); + } + + /* If we are already creating a segment, we're done. */ + if (uv->prepare_inflight != NULL) { + return; + } + + /* If we have already enough prepared open segments, we're done. There + * can't be any outstanding prepare requests, since if the request queue + * was not empty, we would have called uvPrepareFinishOldestRequest() + * above, thus reducing the pool size and making it smaller than the + * target size. */ + if (uvPrepareCount(uv) >= UV__TARGET_POOL_SIZE) { + assert(QUEUE_IS_EMPTY(&uv->prepare_reqs)); + return; + } + + /* Let's start preparing a new open segment. */ + rv = uvPrepareStart(uv); + if (rv != 0) { + uvPrepareFinishAllRequests(uv, rv); + uv->errored = true; + } +} + +/* Discard a prepared open segment, closing its file descriptor and removing the + * underlying file. 
*/ +static void uvPrepareDiscard(struct uv *uv, uv_file fd, uvCounter counter) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + char filename[UV__FILENAME_LEN]; + assert(counter > 0); + assert(fd >= 0); + sprintf(filename, UV__OPEN_TEMPLATE, counter); + UvOsClose(fd); + UvFsRemoveFile(uv->dir, filename, errmsg); +} + +int UvPrepare(struct uv *uv, + uv_file *fd, + uvCounter *counter, + struct uvPrepare *req, + uvPrepareCb cb) +{ + int rv; + + assert(!uv->closing); + + if (!QUEUE_IS_EMPTY(&uv->prepare_pool)) { + uvPrepareConsume(uv, fd, counter); + goto maybe_start; + } + + *fd = -1; + *counter = 0; + req->cb = cb; + QUEUE_PUSH(&uv->prepare_reqs, &req->queue); + +maybe_start: + /* If we are already creating a segment, let's just wait. */ + if (uv->prepare_inflight != NULL) { + return 0; + } + + rv = uvPrepareStart(uv); + if (rv != 0) { + goto err; + } + + return 0; + +err: + if (*fd != -1) { + uvPrepareDiscard(uv, *fd, *counter); + } else { + QUEUE_REMOVE(&req->queue); + } + assert(rv != 0); + return rv; +} + +void UvPrepareClose(struct uv *uv) +{ + assert(uv->closing); + + /* Cancel all pending prepare requests. */ + uvPrepareFinishAllRequests(uv, RAFT_CANCELED); + + /* Remove any unused prepared segment. */ + while (!QUEUE_IS_EMPTY(&uv->prepare_pool)) { + queue *head; + struct uvIdleSegment *segment; + head = QUEUE_HEAD(&uv->prepare_pool); + segment = QUEUE_DATA(head, struct uvIdleSegment, queue); + QUEUE_REMOVE(&segment->queue); + uvPrepareDiscard(uv, segment->fd, segment->counter); + RaftHeapFree(segment); + } +} + +#undef tracef diff --git a/src/raft/uv_recv.c b/src/raft/uv_recv.c new file mode 100644 index 000000000..72d6f5ec6 --- /dev/null +++ b/src/raft/uv_recv.c @@ -0,0 +1,423 @@ +#include + +#include "../raft.h" + +#include "assert.h" +#include "byte.h" +#include "configuration.h" +#include "err.h" +#include "heap.h" +#include "uv.h" +#include "uv_encoding.h" + +/* The happy path for a receiving an RPC message is: + * + * - When a peer server successfully establishes a new connection with us, the + * transport invokes our accept callback. + * + * - A new server object is created and added to the servers array. It starts + * reading from the stream handle of the new connection. + * + * - The RPC message preamble is read, which contains the message type and the + * message length. + * + * - The RPC message header is read, whose content depends on the message type. + * + * - Optionally, the RPC message payload is read (for AppendEntries requests). + * + * - The recv callback passed to raft_io->start() gets fired with the received + * message. + * + * Possible failure modes are: + * + * - The peer server disconnects. In this case the read callback will fire with + * UV_EOF, we'll close the stream handle and then release all memory + * associated with the server object. + * + * - The peer server sends us invalid data. In this case we close the stream + * handle and act like above. 
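+ *
+ * On the wire, every message starts with a two-word preamble, followed by a
+ * header and an optional payload (see uvServerReadCb() below):
+ *
+ *   preamble[0]: message type, 64 bits (only the low 16 bits are decoded)
+ *   preamble[1]: size of the header section in bytes, 64 bits
+ *   header:      message-specific fields
+ *   payload:     present only for AppendEntries (the entries' data) and
+ *                InstallSnapshot (the snapshot data)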
+ */ + +struct uvServer +{ + struct uv *uv; /* libuv I/O implementation object */ + raft_id id; /* ID of the remote server */ + char *address; /* Address of the other server */ + struct uv_stream_s *stream; /* Connection handle */ + uv_buf_t buf; /* Sliding buffer for reading incoming data */ + uint64_t preamble[2]; /* Static buffer with the request preamble */ + uv_buf_t header; /* Dynamic buffer with the request header */ + uv_buf_t payload; /* Dynamic buffer with the request payload */ + struct raft_message message; /* The message being received */ + queue queue; /* Servers queue */ +}; + +/* Initialize a new server object for reading requests from an incoming + * connection. */ +static int uvServerInit(struct uvServer *s, + struct uv *uv, + const raft_id id, + const char *address, + struct uv_stream_s *stream) +{ + s->uv = uv; + s->id = id; + s->address = RaftHeapMalloc(strlen(address) + 1); + if (s->address == NULL) { + return RAFT_NOMEM; + } + strcpy(s->address, address); + s->stream = stream; + s->stream->data = s; + s->buf.base = NULL; + s->buf.len = 0; + s->preamble[0] = 0; + s->preamble[1] = 0; + s->header.base = NULL; + s->header.len = 0; + s->message.type = 0; + s->payload.base = NULL; + s->payload.len = 0; + QUEUE_PUSH(&uv->servers, &s->queue); + return 0; +} + +static void uvServerDestroy(struct uvServer *s) +{ + QUEUE_REMOVE(&s->queue); + + if (s->header.base != NULL) { + /* This means we were interrupted while reading the header. */ + RaftHeapFree(s->header.base); + switch (s->message.type) { + case RAFT_IO_APPEND_ENTRIES: + RaftHeapFree(s->message.append_entries.entries); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + configurationClose( + &s->message.install_snapshot.conf); + break; + } + } + if (s->payload.base != NULL) { + /* This means we were interrupted while reading the payload. */ + RaftHeapFree(s->payload.base); + } + RaftHeapFree(s->address); + RaftHeapFree(s->stream); +} + +/* Invoked to initialize the read buffer for the next asynchronous read on the + * socket. */ +static void uvServerAllocCb(uv_handle_t *handle, + size_t suggested_size, + uv_buf_t *buf) +{ + struct uvServer *s = handle->data; + (void)suggested_size; + + assert(!s->uv->closing); + + /* If this is the first read of the preamble, or of the header, or of + * the payload, then initialize the read buffer, according to the chunk + * of data that we expect next. */ + if (s->buf.len == 0) { + assert(s->buf.base == NULL); + + /* Check if we expect the preamble. */ + if (s->header.len == 0) { + assert(s->preamble[0] == 0); + assert(s->preamble[1] == 0); + s->buf.base = (char *)s->preamble; + s->buf.len = sizeof s->preamble; + goto out; + } + + /* Check if we expect the header. */ + if (s->payload.len == 0) { + assert(s->header.len > 0); + assert(s->header.base == NULL); + s->header.base = RaftHeapMalloc(s->header.len); + if (s->header.base == NULL) { + /* Setting all buffer fields to 0 will make + * read_cb fail with ENOBUFS. */ + memset(buf, 0, sizeof *buf); + return; + } + s->buf = s->header; + goto out; + } + + /* If we get here we should be expecting the payload. */ + assert(s->payload.len > 0); + s->payload.base = RaftHeapMalloc(s->payload.len); + if (s->payload.base == NULL) { + /* Setting all buffer fields to 0 will make read_cb fail + * with ENOBUFS. */ + memset(buf, 0, sizeof *buf); + return; + } + + s->buf = s->payload; + } + +out: + *buf = s->buf; +} + +/* Callback invoked afer the stream handle of this server connection has been + * closed. 
We can release all resources associated with the server object. */ +static void uvServerStreamCloseCb(uv_handle_t *handle) +{ + struct uvServer *s = handle->data; + struct uv *uv = s->uv; + uvServerDestroy(s); + RaftHeapFree(s); + uvMaybeFireCloseCb(uv); +} + +static void uvServerAbort(struct uvServer *s) +{ + struct uv *uv = s->uv; + QUEUE_REMOVE(&s->queue); + QUEUE_PUSH(&uv->aborting, &s->queue); + uv_close((struct uv_handle_s *)s->stream, uvServerStreamCloseCb); +} + +/* Invoke the receive callback. */ +static void uvFireRecvCb(struct uvServer *s) +{ + s->uv->recv_cb(s->uv->io, &s->message); + + /* Reset our state as we'll start reading a new message. We don't need + * to release the payload buffer, since ownership was transferred to the + * user. */ + memset(s->preamble, 0, sizeof s->preamble); + raft_free(s->header.base); + s->message.type = 0; + s->header.base = NULL; + s->header.len = 0; + s->payload.base = NULL; + s->payload.len = 0; +} + +/* Callback invoked when data has been read from the socket. */ +static void uvServerReadCb(uv_stream_t *stream, + ssize_t nread, + const uv_buf_t *buf) +{ + struct uvServer *s = stream->data; + int rv; + + (void)buf; + + assert(!s->uv->closing); + + /* If the read was successful, let's check if we have received all the + * data we expected. */ + if (nread > 0) { + size_t n = (size_t)nread; + + /* We shouldn't have read more data than the pending amount. */ + assert(n <= s->buf.len); + + /* Advance the read window */ + s->buf.base += n; + s->buf.len -= n; + + /* If there's more data to read in order to fill the current + * read buffer, just return, we'll be invoked again. */ + if (s->buf.len > 0) { + return; + } + + if (s->header.len == 0) { + /* If the header buffer is not set, it means that we've + * just completed reading the preamble. */ + assert(s->header.base == NULL); + + s->header.len = (size_t)byteFlip64(s->preamble[1]); + + /* The length of the header must be greater than zero. + */ + if (s->header.len == 0) { + tracef("message has zero length"); + goto abort; + } + } else if (s->payload.len == 0) { + /* If the payload buffer is not set, it means we just + * completed reading the message header. */ + uint64_t type; + + assert(s->header.base != NULL); + + type = byteFlip64(s->preamble[0]); + + /* Only use first 2 bytes of the type. Normally we would + * check if type doesn't overflow UINT16_MAX, but we + * don't do this to allow future legacy nodes to still + * handle messages that include extra information in the + * 6 unused bytes of the type field of the preamble. + * TODO: This is preparation to add the version of the + * message in the raft preamble. Once this change has + * been active for sufficiently long time, we can start + * encoding the version in some of the remaining bytes + * of s->preamble[0]. */ + rv = uvDecodeMessage((uint16_t)type, &s->header, + &s->message, &s->payload.len); + if (rv != 0) { + tracef("decode message: %s", + errCodeToString(rv)); + goto abort; + } + + s->message.server_id = s->id; + s->message.server_address = s->address; + + /* If the message has no payload, we're done. */ + if (s->payload.len == 0) { + uvFireRecvCb(s); + } + } else { + /* If we get here it means that we've just completed + * reading the payload. 
TODO: avoid converting from + * uv_buf_t */ + struct raft_buffer payload; + assert(s->payload.base != NULL); + assert(s->payload.len > 0); + + switch (s->message.type) { + case RAFT_IO_APPEND_ENTRIES: + payload.base = s->payload.base; + payload.len = s->payload.len; + uvDecodeEntriesBatch( + payload.base, 0, + s->message.append_entries.entries, + s->message.append_entries + .n_entries); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + s->message.install_snapshot.data.base = + s->payload.base; + break; + default: + /* We should never have read a payload + * in the first place */ + assert(0); + } + + uvFireRecvCb(s); + } + + /* Mark that we're done with this chunk. When the alloc callback + * will trigger again it will notice that it needs to change the + * read buffer. */ + assert(s->buf.len == 0); + s->buf.base = NULL; + + return; + } + + /* The if nread>0 condition above should always exit the function with a + * goto abort or a return. */ + assert(nread <= 0); + + if (nread == 0) { + /* Empty read */ + return; + } + if (nread != UV_EOF) { + tracef("receive data: %s", uv_strerror((int)nread)); + } + +abort: + uvServerAbort(s); +} + +/* Start reading incoming requests. */ +static int uvServerStart(struct uvServer *s) +{ + int rv; + rv = uv_read_start(s->stream, uvServerAllocCb, uvServerReadCb); + if (rv != 0) { + tracef("start reading: %s", uv_strerror(rv)); + return RAFT_IOERR; + } + return 0; +} + +static int uvAddServer(struct uv *uv, + raft_id id, + const char *address, + struct uv_stream_s *stream) +{ + struct uvServer *server; + int rv; + + /* Initialize the new connection */ + server = RaftHeapMalloc(sizeof *server); + if (server == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + rv = uvServerInit(server, uv, id, address, stream); + if (rv != 0) { + goto err_after_server_alloc; + } + + /* This will start reading requests. */ + rv = uvServerStart(server); + if (rv != 0) { + goto err_after_init_server; + } + + return 0; + +err_after_init_server: + uvServerDestroy(server); +err_after_server_alloc: + raft_free(server); +err: + assert(rv != 0); + return rv; +} + +static void uvRecvAcceptCb(struct raft_uv_transport *transport, + raft_id id, + const char *address, + struct uv_stream_s *stream) +{ + struct uv *uv = transport->data; + int rv; + assert(!uv->closing); + rv = uvAddServer(uv, id, address, stream); + if (rv != 0) { + tracef("add server: %s", errCodeToString(rv)); + uv_close((struct uv_handle_s *)stream, + (uv_close_cb)RaftHeapFree); + } +} + +int UvRecvStart(struct uv *uv) +{ + int rv; + rv = uv->transport->listen(uv->transport, uvRecvAcceptCb); + if (rv != 0) { + return rv; + } + return 0; +} + +void UvRecvClose(struct uv *uv) +{ + while (!QUEUE_IS_EMPTY(&uv->servers)) { + queue *head; + struct uvServer *server; + head = QUEUE_HEAD(&uv->servers); + server = QUEUE_DATA(head, struct uvServer, queue); + uvServerAbort(server); + } +} + +#undef tracef diff --git a/src/raft/uv_segment.c b/src/raft/uv_segment.c new file mode 100644 index 000000000..ca178238d --- /dev/null +++ b/src/raft/uv_segment.c @@ -0,0 +1,1158 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "array.h" +#include "assert.h" +#include "byte.h" +#include "configuration.h" +#include "entry.h" +#include "heap.h" +#include "uv.h" +#include "uv_encoding.h" + +/* Check if the given filename matches the one of a closed segment (xxx-yyy), or + * of an open segment (open-xxx), and fill the given info structure if so. + * + * Return true if the filename matched, false otherwise. 
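+ *
+ * For instance (a sketch; the exact printf/scanf templates are defined in
+ * uv.h):
+ *
+ *   struct uvSegmentInfo info;
+ *   uvSegmentInfoMatch("0000000000000001-0000000000000100", &info);
+ *       -> true, closed segment covering entries 1 through 100
+ *   uvSegmentInfoMatch("open-7", &info);
+ *       -> true, open segment with counter 7
+ *   uvSegmentInfoMatch("metadata1", &info);
+ *       -> false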
*/ +static bool uvSegmentInfoMatch(const char *filename, struct uvSegmentInfo *info) +{ + int consumed; + int matched; + size_t n; + size_t filename_len = strnlen(filename, UV__FILENAME_LEN + 1); + + assert(filename_len < UV__FILENAME_LEN); + + matched = sscanf(filename, UV__CLOSED_TEMPLATE "%n", &info->first_index, + &info->end_index, &consumed); + if (matched == 2 && consumed == (int)filename_len) { + info->is_open = false; + goto match; + } + + matched = + sscanf(filename, UV__OPEN_TEMPLATE "%n", &info->counter, &consumed); + if (matched == 1 && consumed == (int)filename_len) { + info->is_open = true; + goto match; + } + + return false; + +match: + n = sizeof(info->filename) - 1; + strncpy(info->filename, filename, n); + info->filename[n] = '\0'; + return true; +} + +int uvSegmentInfoAppendIfMatch(const char *filename, + struct uvSegmentInfo *infos[], + size_t *n_infos, + bool *appended) +{ + struct uvSegmentInfo info; + bool matched; + int rv; + + /* Check if it's a closed or open filename */ + matched = uvSegmentInfoMatch(filename, &info); + + /* If this is neither a closed or an open segment, return. */ + if (!matched) { + *appended = false; + return 0; + } + + ARRAY__APPEND(struct uvSegmentInfo, info, infos, n_infos, rv); + if (rv == -1) { + return RAFT_NOMEM; + } + + *appended = true; + + return 0; +} + +/* Compare two segments to decide which one is more recent. */ +static int uvSegmentInfoCompare(const void *p1, const void *p2) +{ + struct uvSegmentInfo *s1 = (struct uvSegmentInfo *)p1; + struct uvSegmentInfo *s2 = (struct uvSegmentInfo *)p2; + + /* Closed segments are less recent than open segments. */ + if (s1->is_open && !s2->is_open) { + return 1; + } + if (!s1->is_open && s2->is_open) { + return -1; + } + + /* If the segments are open, compare the counter. */ + if (s1->is_open) { + assert(s2->is_open); + assert(s1->counter != s2->counter); + return s1->counter < s2->counter ? -1 : 1; + } + + /* If the segments are closed, compare the first index. The index ranges + * must be disjoint. */ + if (s2->first_index > s1->end_index) { + return -1; + } + + return 1; +} + +void uvSegmentSort(struct uvSegmentInfo *infos, size_t n_infos) +{ + qsort(infos, n_infos, sizeof *infos, uvSegmentInfoCompare); +} + +int uvSegmentKeepTrailing(struct uv *uv, + struct uvSegmentInfo *segments, + size_t n, + raft_index last_index, + size_t trailing, + char *errmsg) +{ + raft_index retain_index; + size_t i; + int rv; + + assert(last_index > 0); + assert(n > 0); + + if (last_index <= trailing) { + return 0; + } + + /* Index of the oldest entry we want to retain. */ + retain_index = last_index - trailing + 1; + + for (i = 0; i < n; i++) { + struct uvSegmentInfo *segment = &segments[i]; + if (segment->is_open) { + break; + } + if (trailing == 0 || segment->end_index < retain_index) { + rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "delete closed segment %s", + segment->filename); + return rv; + } + } else { + break; + } + } + + return 0; +} + +/* Read a segment file and return its format version. 
*/ +static int uvReadSegmentFile(struct uv *uv, + const char *filename, + struct raft_buffer *buf, + uint64_t *format) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + rv = UvFsReadFile(uv->dir, filename, buf, errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read file"); + return RAFT_IOERR; + } + if (buf->len < 8) { + ErrMsgPrintf(uv->io->errmsg, "file has only %zu bytes", + buf->len); + RaftHeapFree(buf->base); + return RAFT_IOERR; + } + *format = byteFlip64(*(uint64_t *)buf->base); + return 0; +} + +/* Consume the content buffer, returning a pointer to the current position and + * advancing the offset of n bytes. Return an error if not enough bytes are + * available. */ +static int uvConsumeContent(const struct raft_buffer *content, + size_t *offset, + size_t n, + void **data, + char *errmsg) +{ + if (*offset + n > content->len) { + size_t remaining = content->len - *offset; + ErrMsgPrintf(errmsg, "short read: %zu bytes instead of %zu", + remaining, n); + return RAFT_IOERR; + } + if (data != NULL) { + *data = &((uint8_t *)content->base)[*offset]; + } + *offset += n; + return 0; +} + +/* Load a single batch of entries from a segment. + * + * Set @last to #true if the loaded batch is the last one. */ +static int uvLoadEntriesBatch(struct uv *uv, + const struct raft_buffer *content, + struct raft_entry **entries, + unsigned *n_entries, + size_t *offset, /* Offset of last batch */ + bool *last) +{ + void *checksums; /* CRC32 checksums */ + void *batch; /* Entries batch */ + unsigned long n; /* Number of entries in the batch */ + unsigned max_n; /* Maximum number of entries we expect */ + unsigned i; /* Iterate through the entries */ + struct raft_buffer header; /* Batch header */ + struct raft_buffer data; /* Batch data */ + uint32_t crc1; /* Target checksum */ + uint32_t crc2; /* Actual checksum */ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + size_t start; + int rv; + + /* Save the current offset, to provide more information when logging. */ + start = *offset; + + /* Read the checksums. */ + rv = uvConsumeContent(content, offset, sizeof(uint32_t) * 2, &checksums, + errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble"); + return RAFT_IOERR; + } + + /* Read the first 8 bytes of the batch, which contains the number of + * entries in the batch. */ + rv = + uvConsumeContent(content, offset, sizeof(uint64_t), &batch, errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read preamble"); + return RAFT_IOERR; + } + + n = (size_t)byteFlip64(*(uint64_t *)batch); + if (n == 0) { + ErrMsgPrintf(uv->io->errmsg, + "entries count in preamble is zero"); + rv = RAFT_CORRUPT; + goto err; + } + + /* Very optimistic upper bound of the number of entries we should + * expect. This is mainly a protection against allocating too much + * memory. Each entry will consume at least 4 words (for term, type, + * size and payload). */ + max_n = UV__MAX_SEGMENT_SIZE / (sizeof(uint64_t) * 4); + + if (n > max_n) { + ErrMsgPrintf(uv->io->errmsg, + "entries count %lu in preamble is too high", n); + rv = RAFT_CORRUPT; + goto err; + } + + /* Consume the batch header, excluding the first 8 bytes containing the + * number of entries, which we have already read. */ + header.len = uvSizeofBatchHeader(n); + header.base = batch; + + rv = uvConsumeContent(content, offset, + uvSizeofBatchHeader(n) - sizeof(uint64_t), NULL, + errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read header"); + rv = RAFT_IOERR; + goto err; + } + + /* Check batch header integrity. 
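+         * Each batch is laid out as [crc1][crc2][header][data], where the
+         * two CRC32 checksums are 32 bits each: crc1, checked here, covers
+         * the batch header, while crc2 covers the batch data and is verified
+         * further below.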
*/ + crc1 = byteFlip32(((uint32_t *)checksums)[0]); + crc2 = byteCrc32(header.base, header.len, 0); + if (crc1 != crc2) { + ErrMsgPrintf(uv->io->errmsg, "header checksum mismatch"); + rv = RAFT_CORRUPT; + goto err; + } + + /* Decode the batch header, allocating the entries array. */ + rv = uvDecodeBatchHeader(header.base, entries, n_entries); + if (rv != 0) { + goto err; + } + + /* Calculate the total size of the batch data */ + data.len = 0; + for (i = 0; i < n; i++) { + data.len += (*entries)[i].buf.len; + } + data.base = (uint8_t *)content->base + *offset; + + /* Consume the batch data */ + rv = uvConsumeContent(content, offset, data.len, NULL, errmsg); + if (rv != 0) { + ErrMsgTransfer(errmsg, uv->io->errmsg, "read data"); + rv = RAFT_IOERR; + goto err_after_header_decode; + } + + /* Check batch data integrity. */ + crc1 = byteFlip32(((uint32_t *)checksums)[1]); + crc2 = byteCrc32(data.base, data.len, 0); + if (crc1 != crc2) { + ErrMsgPrintf(uv->io->errmsg, "data checksum mismatch"); + rv = RAFT_CORRUPT; + goto err_after_header_decode; + } + + uvDecodeEntriesBatch(content->base, *offset - data.len, *entries, + *n_entries); + + *last = *offset == content->len; + + return 0; + +err_after_header_decode: + RaftHeapFree(*entries); +err: + *entries = NULL; + *n_entries = 0; + assert(rv != 0); + *offset = start; + return rv; +} + +/* Append to @entries2 all entries in @entries1. */ +static int extendEntries(const struct raft_entry *entries1, + const size_t n_entries1, + struct raft_entry **entries2, + size_t *n_entries2) +{ + struct raft_entry *entries; /* To re-allocate the given entries */ + size_t i; + + entries = raft_realloc(*entries2, + (*n_entries2 + n_entries1) * sizeof *entries); + if (entries == NULL) { + return RAFT_NOMEM; + } + + for (i = 0; i < n_entries1; i++) { + entries[*n_entries2 + i] = entries1[i]; + } + + *entries2 = entries; + *n_entries2 += n_entries1; + + return 0; +} + +int uvSegmentLoadClosed(struct uv *uv, + struct uvSegmentInfo *info, + struct raft_entry *entries[], + size_t *n) +{ + bool empty; /* Whether the file is empty */ + uint64_t format; /* Format version */ + bool last; /* Whether the last batch was reached */ + struct raft_entry *tmp_entries; /* Entries in current batch */ + struct raft_buffer buf; /* Segment file content */ + size_t offset; /* Content read cursor */ + unsigned tmp_n; /* Number of entries in current batch */ + unsigned expected_n; /* Number of entries that we expect to find */ + int i; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + expected_n = (unsigned)(info->end_index - info->first_index + 1); + + /* If the segment is completely empty, just bail out. */ + rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg); + if (rv != 0) { + tracef("stat %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err; + } + if (empty) { + ErrMsgPrintf(uv->io->errmsg, "file is empty"); + rv = RAFT_CORRUPT; + goto err; + } + + /* Open the segment file. */ + rv = uvReadSegmentFile(uv, info->filename, &buf, &format); + if (rv != 0) { + goto err; + } + if (format != UV__DISK_FORMAT) { + ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju", + format); + rv = RAFT_CORRUPT; + goto err_after_read; + } + + /* Load all batches in the segment. 
*/ + *entries = NULL; + *n = 0; + + last = false; + offset = sizeof format; + for (i = 1; !last; i++) { + rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n, &offset, + &last); + if (rv != 0) { + ErrMsgWrapf(uv->io->errmsg, + "entries batch %u starting at byte %zu", i, + offset); + /* Clean up the last allocation from extendEntries. */ + goto err_after_extend_entries; + } + rv = extendEntries(tmp_entries, tmp_n, entries, n); + if (rv != 0) { + goto err_after_batch_load; + } + raft_free(tmp_entries); + } + + if (*n != expected_n) { + ErrMsgPrintf(uv->io->errmsg, "found %zu entries (expected %u)", + *n, expected_n); + rv = RAFT_CORRUPT; + goto err_after_extend_entries; + } + + assert(i > 1); /* At least one batch was loaded. */ + assert(*n > 0); /* At least one entry was loaded. */ + + return 0; + +err_after_batch_load: + raft_free(tmp_entries[0].batch); + raft_free(tmp_entries); + +err_after_extend_entries: + if (*entries != NULL) { + RaftHeapFree(*entries); + } + +err_after_read: + RaftHeapFree(buf.base); + +err: + assert(rv != 0); + + return rv; +} + +/* Check if the content of the segment file contains all zeros from the current + * offset onward. */ +static bool uvContentHasOnlyTrailingZeros(const struct raft_buffer *buf, + size_t offset) +{ + size_t i; + + for (i = offset; i < buf->len; i++) { + if (((char *)buf->base)[i] != 0) { + return false; + } + } + + return true; +} + +/* Load all entries contained in an open segment. */ +static int uvSegmentLoadOpen(struct uv *uv, + struct uvSegmentInfo *info, + struct raft_entry *entries[], + size_t *n, + raft_index *next_index) +{ + raft_index first_index; /* Index of first entry in segment */ + bool all_zeros; /* Whether the file is zero'ed */ + bool empty; /* Whether the segment file is empty */ + bool remove = false; /* Whether to remove this segment */ + bool last = false; /* Whether the last batch was reached */ + uint64_t format; /* Format version */ + size_t n_batches = 0; /* Number of loaded batches */ + struct raft_entry *tmp_entries; /* Entries in current batch */ + struct raft_buffer buf = {0}; /* Segment file content */ + size_t offset; /* Content read cursor */ + unsigned tmp_n_entries; /* Number of entries in current batch */ + int i; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + first_index = *next_index; + + rv = UvFsFileIsEmpty(uv->dir, info->filename, &empty, errmsg); + if (rv != 0) { + tracef("check if %s is empty: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err; + } + + if (empty) { + /* Empty segment, let's discard it. */ + tracef("remove empty open segment %s", info->filename); + remove = true; + goto done; + } + + rv = uvReadSegmentFile(uv, info->filename, &buf, &format); + if (rv != 0) { + goto err; + } + + /* Check that the format is the expected one, or perhaps 0, indicating + * that the segment was allocated but never written. */ + offset = sizeof format; + if (format != UV__DISK_FORMAT) { + if (format == 0) { + all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset); + if (all_zeros) { + /* This is equivalent to the empty case, let's + * remove the segment. */ + tracef("remove zeroed open segment %s", + info->filename); + remove = true; + RaftHeapFree(buf.base); + buf.base = NULL; + goto done; + } + } + ErrMsgPrintf(uv->io->errmsg, "unexpected format version %ju", + format); + rv = RAFT_CORRUPT; + goto err_after_read; + } + + /* Load all batches in the segment. 
*/ + for (i = 1; !last; i++) { + rv = uvLoadEntriesBatch(uv, &buf, &tmp_entries, &tmp_n_entries, + &offset, &last); + if (rv != 0) { + /* If this isn't a decoding error, just bail out. */ + if (rv != RAFT_CORRUPT) { + ErrMsgWrapf( + uv->io->errmsg, + "entries batch %u starting at byte %zu", i, + offset); + goto err_after_read; + } + + /* If this is a decoding error, and not an OS error, + * check if the rest of the file is filled with zeros. + * In that case we assume that the server shutdown + * uncleanly and we just truncate this incomplete data. + */ + all_zeros = uvContentHasOnlyTrailingZeros(&buf, offset); + if (!all_zeros) { + tracef("%s has non-zero trail", info->filename); + } + + tracef( + "truncate open segment %s at %zu (batch %d), since " + "it has " + "corrupted " + "entries", + info->filename, offset, i); + + break; + } + + rv = extendEntries(tmp_entries, tmp_n_entries, entries, n); + if (rv != 0) { + goto err_after_batch_load; + } + + raft_free(tmp_entries); + + n_batches++; + *next_index += tmp_n_entries; + } + + if (n_batches == 0) { + RaftHeapFree(buf.base); + buf.base = NULL; + remove = true; + } + +done: + /* If the segment has no valid entries in it, we remove it. Otherwise we + * rename it and keep it. */ + if (remove) { + rv = UvFsRemoveFile(uv->dir, info->filename, errmsg); + if (rv != 0) { + tracef("unlink %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err_after_read; + } + } else { + char filename[UV__SEGMENT_FILENAME_BUF_SIZE]; + raft_index end_index = *next_index - 1; + + /* At least one entry was loaded */ + assert(end_index >= first_index); + int nb = snprintf(filename, sizeof(filename), + UV__CLOSED_TEMPLATE, first_index, end_index); + if ((nb < 0) || ((size_t)nb >= sizeof(filename))) { + tracef("snprintf failed: %d", nb); + rv = RAFT_IOERR; + goto err; + } + + tracef("finalize %s into %s", info->filename, filename); + + rv = UvFsTruncateAndRenameFile( + uv->dir, (size_t)offset, info->filename, filename, errmsg); + if (rv != 0) { + tracef("finalize %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err; + } + + info->is_open = false; + info->first_index = first_index; + info->end_index = end_index; + memset(info->filename, '\0', sizeof(info->filename)); + _Static_assert(sizeof(info->filename) >= sizeof(filename), + "Destination buffer too small"); + /* info->filename is zeroed out, info->filename is at least as + * large as filename and we checked that nb < sizeof(filename) + * -> we won't overflow and the result will be zero terminated. + */ + memcpy(info->filename, filename, (size_t)nb); + } + + return 0; + +err_after_batch_load: + raft_free(tmp_entries[0].batch); + raft_free(tmp_entries); + +err_after_read: + if (buf.base != NULL) { + RaftHeapFree(buf.base); + } + +err: + assert(rv != 0); + + return rv; +} + +/* Ensure that the write buffer of the given segment is large enough to hold the + * the given number of bytes size. */ +static int uvEnsureSegmentBufferIsLargeEnough(struct uvSegmentBuffer *b, + size_t size) +{ + unsigned n = (unsigned)(size / b->block_size); + void *base; + size_t len; + + if (b->arena.len >= size) { + assert(b->arena.base != NULL); + return 0; + } + + if (size % b->block_size != 0) { + n++; + } + + len = b->block_size * n; + base = raft_aligned_alloc(b->block_size, len); + if (base == NULL) { + return RAFT_NOMEM; + } + memset(base, 0, len); + + /* If the current arena is initialized, we need to copy its content, + * since it might have data that we want to retain in the next write. 
*/ + if (b->arena.base != NULL) { + assert(b->arena.len >= b->block_size); + memcpy(base, b->arena.base, b->arena.len); + raft_aligned_free(b->block_size, b->arena.base); + } + + b->arena.base = base; + b->arena.len = len; + + return 0; +} + +void uvSegmentBufferInit(struct uvSegmentBuffer *b, size_t block_size) +{ + b->block_size = block_size; + b->arena.base = NULL; + b->arena.len = 0; + b->n = 0; +} + +void uvSegmentBufferClose(struct uvSegmentBuffer *b) +{ + if (b->arena.base != NULL) { + raft_aligned_free(b->block_size, b->arena.base); + } +} + +int uvSegmentBufferFormat(struct uvSegmentBuffer *b) +{ + int rv; + void *cursor; + size_t n; + assert(b->n == 0); + n = sizeof(uint64_t); + rv = uvEnsureSegmentBufferIsLargeEnough(b, n); + if (rv != 0) { + return rv; + } + b->n = n; + cursor = b->arena.base; + bytePut64(&cursor, UV__DISK_FORMAT); + return 0; +} + +int uvSegmentBufferAppend(struct uvSegmentBuffer *b, + const struct raft_entry entries[], + unsigned n_entries) +{ + size_t size; /* Total size of the batch */ + uint32_t crc1; /* Header checksum */ + uint32_t crc2; /* Data checksum */ + void *crc1_p; /* Pointer to header checksum slot */ + void *crc2_p; /* Pointer to data checksum slot */ + void *header; /* Pointer to the header section */ + void *cursor; + unsigned i; + int rv; + + size = sizeof(uint32_t) * 2; /* CRC checksums */ + size += uvSizeofBatchHeader(n_entries); /* Batch header */ + for (i = 0; i < n_entries; i++) { /* Entries data */ + size += bytePad64(entries[i].buf.len); + } + + rv = uvEnsureSegmentBufferIsLargeEnough(b, b->n + size); + if (rv != 0) { + return rv; + } + cursor = b->arena.base + b->n; + + /* Placeholder of the checksums */ + crc1_p = cursor; + bytePut32(&cursor, 0); + crc2_p = cursor; + bytePut32(&cursor, 0); + + /* Batch header */ + header = cursor; + uvEncodeBatchHeader(entries, n_entries, cursor); + crc1 = byteCrc32(header, uvSizeofBatchHeader(n_entries), 0); + cursor = (uint8_t *)cursor + uvSizeofBatchHeader(n_entries); + + /* Batch data */ + crc2 = 0; + for (i = 0; i < n_entries; i++) { + const struct raft_entry *entry = &entries[i]; + assert(entry->buf.len % sizeof(uint64_t) == 0); + memcpy(cursor, entry->buf.base, entry->buf.len); + crc2 = byteCrc32(cursor, entry->buf.len, crc2); + cursor = (uint8_t *)cursor + entry->buf.len; + } + + bytePut32(&crc1_p, crc1); + bytePut32(&crc2_p, crc2); + b->n += size; + + return 0; +} + +void uvSegmentBufferFinalize(struct uvSegmentBuffer *b, uv_buf_t *out) +{ + unsigned n_blocks; + unsigned tail; + + n_blocks = (unsigned)(b->n / b->block_size); + if (b->n % b->block_size != 0) { + n_blocks++; + } + + /* Set the remainder of the last block to 0 */ + tail = (unsigned)(b->n % b->block_size); + if (tail != 0) { + memset(b->arena.base + b->n, 0, b->block_size - tail); + } + + out->base = b->arena.base; + out->len = n_blocks * b->block_size; +} + +void uvSegmentBufferReset(struct uvSegmentBuffer *b, unsigned retain) +{ + assert(b->n > 0); + assert(b->arena.base != NULL); + + if (retain == 0) { + b->n = 0; + memset(b->arena.base, 0, b->block_size); + return; + } + + memcpy(b->arena.base, b->arena.base + retain * b->block_size, + b->block_size); + b->n = b->n % b->block_size; +} + +/* When a corrupted segment is detected, the segment is renamed. + * Upon a restart, raft will not detect the segment anymore and will try + * to start without it. 
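+ *
+ * For example, a corrupt open segment "open-3" would be renamed to something
+ * like "corrupt-1715000000000000000-open-3" (illustrative timestamp),
+ * following CORRUPT_FILE_FMT below: the wall-clock time in nanoseconds, then
+ * the original filename.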
+ * */ +#define CORRUPT_FILE_FMT "corrupt-%" PRId64 "-%s" +static void uvMoveCorruptSegment(struct uv *uv, struct uvSegmentInfo *info) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + char new_filename[UV__FILENAME_LEN + 1] = {0}; + size_t sz = sizeof(new_filename); + int rv; + + struct timespec ts = {0}; + /* Ignore errors */ + clock_gettime(CLOCK_REALTIME, &ts); + int64_t ns = ts.tv_sec * 1000000000 + ts.tv_nsec; + rv = snprintf(new_filename, sz, CORRUPT_FILE_FMT, ns, info->filename); + if (rv < 0 || rv >= (int)sz) { + tracef("snprintf %d", rv); + return; + } + + UvFsRenameFile(uv->dir, info->filename, new_filename, errmsg); + if (rv != 0) { + tracef("%s", errmsg); + return; + } +} + +/* + * On startup, raft will try to recover when a corrupt segment is detected. + * + * When a corrupt open segment is encountered, it, and all subsequent open + * segments, are renamed. Not renaming newer, possible non-corrupt, open + * segments could lead to loading inconsistent data. + * + * When a corrupt closed segment is encountered, it will be renamed when + * it is the last closed segment, in that case all open-segments are renamed + * too. + */ +static void uvRecoverFromCorruptSegment(struct uv *uv, + size_t i_corrupt, + struct uvSegmentInfo *infos, + size_t n_infos) +{ + struct uvSegmentInfo *info = &infos[i_corrupt]; + if (info->is_open) { + for (size_t i = i_corrupt; i < n_infos; ++i) { + info = &infos[i]; + uvMoveCorruptSegment(uv, info); + } + } else { + size_t i_next = i_corrupt + 1; + /* last segment or last closed segment. */ + if (i_next == n_infos || infos[i_next].is_open) { + for (size_t i = i_corrupt; i < n_infos; ++i) { + info = &infos[i]; + uvMoveCorruptSegment(uv, info); + } + } + } +} + +int uvSegmentLoadAll(struct uv *uv, + const raft_index start_index, + struct uvSegmentInfo *infos, + size_t n_infos, + struct raft_entry **entries, + size_t *n_entries) +{ + raft_index next_index; /* Next entry to load from disk */ + struct raft_entry *tmp_entries; /* Entries in current segment */ + size_t tmp_n; /* Number of entries in current segment */ + size_t i; + int rv; + + assert(start_index >= 1); + assert(n_infos > 0); + + *entries = NULL; + *n_entries = 0; + + next_index = start_index; + + for (i = 0; i < n_infos; i++) { + struct uvSegmentInfo *info = &infos[i]; + + tracef("load segment %s", info->filename); + + if (info->is_open) { + rv = uvSegmentLoadOpen(uv, info, entries, n_entries, + &next_index); + ErrMsgWrapf(uv->io->errmsg, "load open segment %s", + info->filename); + if (rv != 0) { + if (rv == RAFT_CORRUPT && uv->auto_recovery) { + uvRecoverFromCorruptSegment( + uv, i, infos, n_infos); + } + goto err; + } + } else { + assert(info->first_index >= start_index); + assert(info->first_index <= info->end_index); + + /* Check that the start index encoded in the name of the + * segment matches what we expect and there are no gaps + * in the sequence. 
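+                         * For example, after loading a closed segment
+                         * covering entries (1..100), next_index is 101: the
+                         * next closed segment must start exactly at index
+                         * 101, otherwise some entries would be missing from
+                         * the log and we must treat the store as corrupt.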
*/ + if (info->first_index != next_index) { + ErrMsgPrintf(uv->io->errmsg, + "unexpected closed segment %s: " + "first index should " + "have been %llu", + info->filename, next_index); + rv = RAFT_CORRUPT; + goto err; + } + + rv = + uvSegmentLoadClosed(uv, info, &tmp_entries, &tmp_n); + if (rv != 0) { + ErrMsgWrapf(uv->io->errmsg, + "load closed segment %s", + info->filename); + if (rv == RAFT_CORRUPT && uv->auto_recovery) { + uvRecoverFromCorruptSegment( + uv, i, infos, n_infos); + } + goto err; + } + + assert(tmp_n > 0); + rv = extendEntries(tmp_entries, tmp_n, entries, + n_entries); + if (rv != 0) { + /* TODO: release memory of entries in + * tmp_entries */ + goto err; + } + + raft_free(tmp_entries); + next_index += tmp_n; + } + } + + return 0; + +err: + assert(rv != 0); + + /* Free any batch that we might have allocated and the entries array as + * well. */ + if (*entries != NULL) { + void *batch = NULL; + + for (i = 0; i < *n_entries; i++) { + struct raft_entry *entry = &(*entries)[i]; + + if (entry->batch != batch) { + batch = entry->batch; + raft_free(batch); + } + } + + raft_free(*entries); + *entries = NULL; + *n_entries = 0; + } + + return rv; +} + +/* Write a closed segment */ +static int uvWriteClosedSegment(struct uv *uv, + raft_index first_index, + raft_index last_index, + const struct raft_buffer *conf) +{ + char filename[UV__FILENAME_LEN]; + struct uvSegmentBuffer buf = {0}; + struct raft_buffer data; + struct raft_entry entry = {0}; + size_t cap; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + assert(first_index <= last_index); + + /* Render the path */ + sprintf(filename, UV__CLOSED_TEMPLATE, first_index, last_index); + + /* Make sure that the given encoded configuration fits in the first + * block */ + cap = uv->block_size - + (sizeof(uint64_t) /* Format version */ + + sizeof(uint64_t) /* Checksums */ + uvSizeofBatchHeader(1)); + if (conf->len > cap) { + return RAFT_TOOBIG; + } + + uvSegmentBufferInit(&buf, uv->block_size); + + rv = uvSegmentBufferFormat(&buf); + if (rv != 0) { + return rv; + } + + entry.term = 1; + entry.type = RAFT_CHANGE; + entry.buf = *conf; + + rv = uvSegmentBufferAppend(&buf, &entry, 1); + if (rv != 0) { + uvSegmentBufferClose(&buf); + return rv; + } + + data.base = buf.arena.base; + data.len = buf.n; + rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg); + uvSegmentBufferClose(&buf); + if (rv != 0) { + tracef("write segment %s: %s", filename, errmsg); + return RAFT_IOERR; + } + + return 0; +} + +int uvSegmentCreateFirstClosed(struct uv *uv, + const struct raft_configuration *configuration) +{ + return uvSegmentCreateClosedWithConfiguration(uv, 1, configuration); +} + +int uvSegmentCreateClosedWithConfiguration( + struct uv *uv, + raft_index index, + const struct raft_configuration *configuration) +{ + struct raft_buffer buf; + char filename[UV__FILENAME_LEN]; + int rv; + + /* Render the path */ + sprintf(filename, UV__CLOSED_TEMPLATE, index, index); + + /* Encode the given configuration. 
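+	 * Note (editor's addition): configurationEncode() allocates buf.base,
+	 * which is released below on both the success and the error path.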
+	 */
+	rv = configurationEncode(configuration, &buf);
+	if (rv != 0) {
+		goto err;
+	}
+
+	/* Write the file */
+	rv = uvWriteClosedSegment(uv, index, index, &buf);
+	if (rv != 0) {
+		goto err_after_configuration_encode;
+	}
+
+	raft_free(buf.base);
+
+	rv = UvFsSyncDir(uv->dir, uv->io->errmsg);
+	if (rv != 0) {
+		return RAFT_IOERR;
+	}
+
+	return 0;
+
+err_after_configuration_encode:
+	raft_free(buf.base);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+int uvSegmentTruncate(struct uv *uv,
+		      struct uvSegmentInfo *segment,
+		      raft_index index)
+{
+	char filename[UV__FILENAME_LEN];
+	struct raft_entry *entries;
+	struct uvSegmentBuffer buf;
+	struct raft_buffer data;
+	size_t n;
+	unsigned m;
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int rv;
+
+	assert(!segment->is_open);
+
+	tracef("truncate %llu-%llu at %llu", segment->first_index,
+	       segment->end_index, index);
+
+	rv = uvSegmentLoadClosed(uv, segment, &entries, &n);
+	if (rv != 0) {
+		ErrMsgWrapf(uv->io->errmsg, "load closed segment %s",
+			    segment->filename);
+		goto out;
+	}
+
+	/* Discard all entries from the truncate index onwards (inclusive) */
+	assert(index - segment->first_index < n);
+	m = (unsigned)(index - segment->first_index);
+
+	uvSegmentBufferInit(&buf, uv->block_size);
+
+	rv = uvSegmentBufferFormat(&buf);
+	if (rv != 0) {
+		goto out_after_buffer_init;
+	}
+
+	rv = uvSegmentBufferAppend(&buf, entries, m);
+	if (rv != 0) {
+		goto out_after_buffer_init;
+	}
+
+	/* Render the path.
+	 *
+	 * TODO: we should use a temporary file name so in case of crash we
+	 * don't consider this segment as corrupted.
+	 */
+	sprintf(filename, UV__CLOSED_TEMPLATE, segment->first_index, index - 1);
+
+	data.base = buf.arena.base;
+	data.len = buf.n;
+
+	rv = UvFsMakeFile(uv->dir, filename, &data, 1, errmsg);
+	if (rv != 0) {
+		tracef("write %s: %s", filename, errmsg);
+		rv = RAFT_IOERR;
+		goto out_after_buffer_init;
+	}
+
+out_after_buffer_init:
+	uvSegmentBufferClose(&buf);
+	entryBatchesDestroy(entries, n);
+out:
+	return rv;
+}
+
+#undef tracef
diff --git a/src/raft/uv_send.c b/src/raft/uv_send.c
new file mode 100644
index 000000000..86133542c
--- /dev/null
+++ b/src/raft/uv_send.c
@@ -0,0 +1,519 @@
+#include <string.h>
+
+#include "../raft.h"
+#include "assert.h"
+#include "heap.h"
+#include "uv.h"
+#include "uv_encoding.h"
+
+/* The happy path for a raft_io_send request is:
+ *
+ * - Get the uvClient object whose address matches the one of the target
+ *   server.
+ * - Encode the message and write it using the uvClient's TCP handle.
+ * - Once the write completes, fire the send request callback.
+ *
+ * Possible failure modes are:
+ *
+ * - The uv->clients queue has no client object with a matching address. In
+ *   this case add a new client object to the queue, add the send request to
+ *   the queue of pending requests and submit a connection request. Once the
+ *   connection request succeeds, try to write the encoded request to the
+ *   connected stream handle. If the connection request fails, schedule
+ *   another attempt.
+ *
+ * - The uv->clients queue has a client object which is not connected. Add the
+ *   send request to the pending queue, and, if there's no connection attempt
+ *   already in progress, start a new one.
+ *
+ * - The write request fails (either synchronously or asynchronously). In this
+ *   case we fire the request callback with an error, close the connection
+ *   stream, and start a re-connection attempt.
+ */
+
+/* Maximum number of requests that can be buffered.
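+ *
+ * When a connection attempt fails and the pending queue has grown beyond this
+ * limit, the oldest requests are evicted and their callbacks fired with
+ * RAFT_NOCONNECTION (see uvClientConnectCb below).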
*/ +#define UV__CLIENT_MAX_PENDING 3 + +struct uvClient +{ + struct uv *uv; /* libuv I/O implementation object */ + struct uv_timer_s timer; /* Schedule connection attempts */ + struct raft_uv_connect connect; /* Connection request */ + struct uv_stream_s *stream; /* Current connection handle */ + struct uv_stream_s *old_stream; /* Connection handle being closed */ + unsigned n_connect_attempt; /* Consecutive connection attempts */ + raft_id id; /* ID of the other server */ + char *address; /* Address of the other server */ + queue pending; /* Pending send message requests */ + queue queue; /* Clients queue */ + bool closing; /* True after calling uvClientAbort */ +}; + +/* Hold state for a single send RPC message request. */ +struct uvSend +{ + struct uvClient *client; /* Client connected to the target server */ + struct raft_io_send *req; /* User request */ + uv_buf_t *bufs; /* Encoded raft RPC message to send */ + unsigned n_bufs; /* Number of buffers */ + uv_write_t write; /* Stream write request */ + queue queue; /* Pending send requests queue */ +}; + +/* Free all memory used by the given send request object, including the object + * itself. */ +static void uvSendDestroy(struct uvSend *s) +{ + if (s->bufs != NULL) { + /* Just release the first buffer. Further buffers are entry or + * snapshot payloads, which we were passed but we don't own. */ + RaftHeapFree(s->bufs[0].base); + + /* Release the buffers array. */ + RaftHeapFree(s->bufs); + } + RaftHeapFree(s); +} + +/* Initialize a new client associated with the given server. */ +static int uvClientInit(struct uvClient *c, + struct uv *uv, + raft_id id, + const char *address) +{ + int rv; + c->uv = uv; + c->timer.data = c; + c->connect.data = NULL; /* Set upon starting a connect request */ + c->stream = NULL; /* Set upon successful connection */ + c->old_stream = NULL; /* Set after closing the current connection */ + c->n_connect_attempt = 0; + c->id = id; + c->address = RaftHeapMalloc(strlen(address) + 1); + if (c->address == NULL) { + return RAFT_NOMEM; + } + rv = uv_timer_init(c->uv->loop, &c->timer); + assert(rv == 0); + strcpy(c->address, address); + QUEUE_INIT(&c->pending); + c->closing = false; + QUEUE_PUSH(&uv->clients, &c->queue); + return 0; +} + +/* If there's no more pending cleanup, remove the client from the abort queue + * and destroy it. */ +static void uvClientMaybeDestroy(struct uvClient *c) +{ + struct uv *uv = c->uv; + + assert(c->stream == NULL); + + if (c->connect.data != NULL) { + return; + } + if (c->timer.data != NULL) { + return; + } + if (c->old_stream != NULL) { + return; + } + + while (!QUEUE_IS_EMPTY(&c->pending)) { + queue *head; + struct uvSend *send; + struct raft_io_send *req; + head = QUEUE_HEAD(&c->pending); + send = QUEUE_DATA(head, struct uvSend, queue); + QUEUE_REMOVE(head); + req = send->req; + uvSendDestroy(send); + if (req->cb != NULL) { + req->cb(req, RAFT_CANCELED); + } + } + + QUEUE_REMOVE(&c->queue); + + assert(c->address != NULL); + RaftHeapFree(c->address); + RaftHeapFree(c); + + uvMaybeFireCloseCb(uv); +} + +/* Forward declaration. */ +static void uvClientConnect(struct uvClient *c); + +static void uvClientDisconnectCloseCb(struct uv_handle_s *handle) +{ + struct uvClient *c = handle->data; + assert(c->old_stream != NULL); + assert(c->stream == NULL); + assert(handle == (struct uv_handle_s *)c->old_stream); + RaftHeapFree(c->old_stream); + c->old_stream = NULL; + if (c->closing) { + uvClientMaybeDestroy(c); + } else { + uvClientConnect(c); /* Trigger a new connection attempt. 
*/ + } +} + +/* Close the current connection. */ +static void uvClientDisconnect(struct uvClient *c) +{ + assert(c->stream != NULL); + assert(c->old_stream == NULL); + c->old_stream = c->stream; + c->stream = NULL; + uv_close((struct uv_handle_s *)c->old_stream, + uvClientDisconnectCloseCb); +} + +/* Invoked once an encoded RPC message has been written out. */ +static void uvSendWriteCb(struct uv_write_s *write, const int status) +{ + struct uvSend *send = write->data; + struct uvClient *c = send->client; + struct raft_io_send *req = send->req; + int cb_status = 0; + + /* If the write failed and we're not currently closing, let's consider + * the current stream handle as busted and start disconnecting (unless + * we're already doing so). We'll trigger a new connection attempt once + * the handle is closed. */ + if (status != 0) { + cb_status = RAFT_IOERR; + if (!c->closing) { + if (c->stream != NULL) { + uvClientDisconnect(c); + } + } else if (status == UV_ECANCELED) { + cb_status = RAFT_CANCELED; + } + } + + uvSendDestroy(send); + + if (req->cb != NULL) { + req->cb(req, cb_status); + } +} + +static int uvClientSend(struct uvClient *c, struct uvSend *send) +{ + int rv; + assert(!c->closing); + send->client = c; + + /* If there's no connection available, let's queue the request. */ + if (c->stream == NULL) { + tracef("no connection available -> enqueue message"); + QUEUE_PUSH(&c->pending, &send->queue); + return 0; + } + + tracef("connection available -> write message"); + send->write.data = send; + rv = uv_write(&send->write, c->stream, send->bufs, send->n_bufs, + uvSendWriteCb); + if (rv != 0) { + tracef("write message failed -> rv %d", rv); + /* UNTESTED: what are the error conditions? perhaps ENOMEM */ + return RAFT_IOERR; + } + + return 0; +} + +/* Try to execute all send requests that were blocked in the queue waiting for a + * connection. */ +static void uvClientSendPending(struct uvClient *c) +{ + int rv; + assert(c->stream != NULL); + tracef("send pending messages"); + while (!QUEUE_IS_EMPTY(&c->pending)) { + queue *head; + struct uvSend *send; + head = QUEUE_HEAD(&c->pending); + send = QUEUE_DATA(head, struct uvSend, queue); + QUEUE_REMOVE(head); + rv = uvClientSend(c, send); + if (rv != 0) { + if (send->req->cb != NULL) { + send->req->cb(send->req, rv); + } + uvSendDestroy(send); + } + } +} + +static void uvClientTimerCb(uv_timer_t *timer) +{ + struct uvClient *c = timer->data; + tracef("timer expired -> attempt to reconnect"); + uvClientConnect(c); /* Retry to connect. */ +} + +/* Return the number of send requests that we have been parked in the send queue + * because no connection is available yet. */ +static unsigned uvClientPendingCount(struct uvClient *c) +{ + queue *head; + unsigned n = 0; + QUEUE_FOREACH(head, &c->pending) + { + n++; + } + return n; +} + +static void uvClientConnectCb(struct raft_uv_connect *req, + struct uv_stream_s *stream, + int status) +{ + struct uvClient *c = req->data; + unsigned n_pending; + int rv; + + tracef("connect attempt completed -> status %s", + errCodeToString(status)); + + assert(c->connect.data != NULL); + assert(c->stream == NULL); + assert(c->old_stream == NULL); + assert(!uv_is_active((struct uv_handle_s *)&c->timer)); + + c->connect.data = NULL; + + /* If we are closing, bail out, possibly discarding the new connection. 
+ */ + if (c->closing) { + if (status == 0) { + assert(stream != NULL); + c->stream = stream; + c->stream->data = c; + uvClientDisconnect(c); + } else { + uvClientMaybeDestroy(c); + } + return; + } + + /* If, the connection attempt was successful, we're good. If we have + * pending requests, let's try to execute them. */ + if (status == 0) { + assert(stream != NULL); + c->stream = stream; + c->n_connect_attempt = 0; + c->stream->data = c; + uvClientSendPending(c); + return; + } + + /* Shrink the queue of pending requests, by failing the oldest ones */ + n_pending = uvClientPendingCount(c); + if (n_pending > UV__CLIENT_MAX_PENDING) { + unsigned i; + for (i = 0; i < n_pending - UV__CLIENT_MAX_PENDING; i++) { + tracef("queue full -> evict oldest message"); + queue *head; + struct uvSend *old_send; + struct raft_io_send *old_req; + head = QUEUE_HEAD(&c->pending); + old_send = QUEUE_DATA(head, struct uvSend, queue); + QUEUE_REMOVE(head); + old_req = old_send->req; + uvSendDestroy(old_send); + if (old_req->cb != NULL) { + old_req->cb(old_req, RAFT_NOCONNECTION); + } + } + } + + /* Let's schedule another attempt. */ + rv = uv_timer_start(&c->timer, uvClientTimerCb, + c->uv->connect_retry_delay, 0); + assert(rv == 0); +} + +/* Perform a single connection attempt, scheduling a retry if it fails. */ +static void uvClientConnect(struct uvClient *c) +{ + int rv; + + assert(!c->closing); + assert(c->stream == NULL); + assert(c->old_stream == NULL); + assert(!uv_is_active((struct uv_handle_s *)&c->timer)); + assert(c->connect.data == NULL); + + c->n_connect_attempt++; + + c->connect.data = c; + rv = c->uv->transport->connect(c->uv->transport, &c->connect, c->id, + c->address, uvClientConnectCb); + if (rv != 0) { + /* Restart the timer, so we can retry. */ + c->connect.data = NULL; + rv = uv_timer_start(&c->timer, uvClientTimerCb, + c->uv->connect_retry_delay, 0); + assert(rv == 0); + } +} + +/* Final callback in the close chain of an io_uv__client object */ +static void uvClientTimerCloseCb(struct uv_handle_s *handle) +{ + struct uvClient *c = handle->data; + assert(handle == (struct uv_handle_s *)&c->timer); + c->timer.data = NULL; + uvClientMaybeDestroy(c); +} + +/* Start shutting down a client. This happens when the `raft_io` instance + * has been closed or when the address of the client has changed. */ +static void uvClientAbort(struct uvClient *c) +{ + struct uv *uv = c->uv; + int rv; + + assert(c->stream != NULL || c->old_stream != NULL || + uv_is_active((struct uv_handle_s *)&c->timer) || + c->connect.data != NULL); + + QUEUE_REMOVE(&c->queue); + QUEUE_PUSH(&uv->aborting, &c->queue); + + rv = uv_timer_stop(&c->timer); + assert(rv == 0); + + /* If we are connected, let's close the outbound stream handle. This + * will eventually complete all inflight write requests, possibly with + * failing them with UV_ECANCELED. */ + if (c->stream != NULL) { + uvClientDisconnect(c); + } + + /* Closing the timer implicitly stop it, so the timeout callback won't + * be fired. */ + uv_close((struct uv_handle_s *)&c->timer, uvClientTimerCloseCb); + c->closing = true; +} + +/* Find the client object associated with the given server, or create one if + * there's none yet. */ +static int uvGetClient(struct uv *uv, + const raft_id id, + const char *address, + struct uvClient **client) +{ + queue *head; + int rv; + + /* Check if we already have a client object for this peer server. 
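+	 * The clients queue is scanned linearly; a raft cluster typically has
+	 * only a handful of peers, so this lookup is cheap in practice.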
*/ + QUEUE_FOREACH(head, &uv->clients) + { + *client = QUEUE_DATA(head, struct uvClient, queue); + if ((*client)->id != id) { + continue; + } + + /* Client address has changed, abort connection and create a new + * one. */ + if (strcmp((*client)->address, address) != 0) { + uvClientAbort(*client); + break; + } + + return 0; + } + + /* Initialize the new connection */ + *client = RaftHeapMalloc(sizeof **client); + if (*client == NULL) { + rv = RAFT_NOMEM; + goto err; + } + + rv = uvClientInit(*client, uv, id, address); + if (rv != 0) { + goto err_after_client_alloc; + } + + /* Make a first connection attempt right away.. */ + uvClientConnect(*client); + + return 0; + +err_after_client_alloc: + RaftHeapFree(*client); +err: + assert(rv != 0); + return rv; +} + +int UvSend(struct raft_io *io, + struct raft_io_send *req, + const struct raft_message *message, + raft_io_send_cb cb) +{ + struct uv *uv = io->impl; + struct uvSend *send; + struct uvClient *client; + int rv; + + assert(!uv->closing); + + /* Allocate a new request object. */ + send = RaftHeapMalloc(sizeof *send); + if (send == NULL) { + rv = RAFT_NOMEM; + goto err; + } + send->req = req; + req->cb = cb; + + rv = uvEncodeMessage(message, &send->bufs, &send->n_bufs); + if (rv != 0) { + send->bufs = NULL; + goto err_after_send_alloc; + } + + /* Get a client object connected to the target server, creating it if it + * doesn't exist yet. */ + rv = uvGetClient(uv, message->server_id, message->server_address, + &client); + if (rv != 0) { + goto err_after_send_alloc; + } + + rv = uvClientSend(client, send); + if (rv != 0) { + goto err_after_send_alloc; + } + + return 0; + +err_after_send_alloc: + uvSendDestroy(send); +err: + assert(rv != 0); + return rv; +} + +void UvSendClose(struct uv *uv) +{ + assert(uv->closing); + while (!QUEUE_IS_EMPTY(&uv->clients)) { + queue *head; + struct uvClient *client; + head = QUEUE_HEAD(&uv->clients); + client = QUEUE_DATA(head, struct uvClient, queue); + uvClientAbort(client); + } +} + +#undef tracef diff --git a/src/raft/uv_snapshot.c b/src/raft/uv_snapshot.c new file mode 100644 index 000000000..d4b8910d1 --- /dev/null +++ b/src/raft/uv_snapshot.c @@ -0,0 +1,808 @@ +#include +#include + +#include "array.h" +#include "assert.h" +#include "byte.h" +#include "compress.h" +#include "configuration.h" +#include "heap.h" +#include "uv.h" +#include "uv_encoding.h" +#include "uv_os.h" + +/* Arbitrary maximum configuration size. Should be practically be enough */ +#define UV__META_MAX_CONFIGURATION_SIZE 1024 * 1024 + +/* Returns true if the filename is a valid snapshot file or snapshot meta + * filename depending on the `meta` switch. If the parse is successful, the + * arguments will contain the parsed values. */ +static bool uvSnapshotParseFilename(const char *filename, + bool meta, + raft_term *term, + raft_index *index, + raft_time *timestamp) +{ + /* Check if it's a well-formed snapshot filename */ + int consumed = 0; + int matched; + size_t filename_len = strlen(filename); + assert(filename_len < UV__FILENAME_LEN); + if (meta) { + matched = sscanf(filename, UV__SNAPSHOT_META_TEMPLATE "%n", + term, index, timestamp, &consumed); + } else { + matched = sscanf(filename, UV__SNAPSHOT_TEMPLATE "%n", term, + index, timestamp, &consumed); + } + if (matched != 3 || consumed != (int)filename_len) { + return false; + } + + return true; +} + +/* Check if the given filename matches the pattern of a snapshot metadata + * filename (snapshot-xxx-yyy-zzz.meta), and fill the given info structure if + * so. 
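+ * For example (an editor's illustration):
+ * "snapshot-3-8192-1691063549.meta" would parse to term 3, index 8192 and
+ * timestamp 1691063549.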
+ * + * Return true if the filename matched, false otherwise. */ +static bool uvSnapshotInfoMatch(const char *filename, + struct uvSnapshotInfo *info) +{ + if (!uvSnapshotParseFilename(filename, true, &info->term, &info->index, + &info->timestamp)) { + return false; + } + /* Allow room for '\0' terminator */ + size_t n = sizeof(info->filename) - 1; + strncpy(info->filename, filename, n); + info->filename[n] = '\0'; + return true; +} + +void uvSnapshotFilenameOf(struct uvSnapshotInfo *info, char *filename) +{ + size_t len = strlen(info->filename) - strlen(".meta"); + assert(len < UV__FILENAME_LEN); + strcpy(filename, info->filename); + filename[len] = 0; +} + +int UvSnapshotInfoAppendIfMatch(struct uv *uv, + const char *filename, + struct uvSnapshotInfo *infos[], + size_t *n_infos, + bool *appended) +{ + struct uvSnapshotInfo info; + bool matched; + char snapshot_filename[UV__FILENAME_LEN]; + bool exists; + bool is_empty; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + /* Check if it's a snapshot metadata filename */ + matched = uvSnapshotInfoMatch(filename, &info); + if (!matched) { + *appended = false; + return 0; + } + + /* Check if there's actually a valid snapshot file for this snapshot + * metadata. If there's none or it's empty, it means that we aborted + * before finishing the snapshot, or that another thread is still busy + * writing the snapshot. */ + uvSnapshotFilenameOf(&info, snapshot_filename); + rv = UvFsFileExists(uv->dir, snapshot_filename, &exists, errmsg); + if (rv != 0) { + tracef("stat %s: %s", snapshot_filename, errmsg); + rv = RAFT_IOERR; + return rv; + } + if (!exists) { + *appended = false; + return 0; + } + + /* TODO This check is strictly not needed, snapshot files are created by + * renaming fully written and synced tmp-files. Leaving it here, just to + * be extra-safe. Can probably be removed once more data integrity + * checks are performed at startup. */ + rv = UvFsFileIsEmpty(uv->dir, snapshot_filename, &is_empty, errmsg); + if (rv != 0) { + tracef("is_empty %s: %s", snapshot_filename, errmsg); + rv = RAFT_IOERR; + return rv; + } + if (is_empty) { + *appended = false; + return 0; + } + + ARRAY__APPEND(struct uvSnapshotInfo, info, infos, n_infos, rv); + if (rv == -1) { + return RAFT_NOMEM; + } + *appended = true; + + return 0; +} + +static int uvSnapshotIsOrphanInternal(const char *dir, + const char *filename, + bool meta, + bool *orphan) +{ + int rv; + *orphan = false; + + raft_term term; + raft_index index; + raft_time timestamp; + if (!uvSnapshotParseFilename(filename, meta, &term, &index, + ×tamp)) { + return 0; + } + + /* filename is a well-formed snapshot filename, check if the sibling + * file exists. 
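+	 * (For a metadata file the sibling is the bare snapshot file, and for
+	 * a snapshot file it is the ".meta" file; e.g. "snapshot-1-2-3.meta"
+	 * and "snapshot-1-2-3", an editor's illustration.)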
+	 */
+	char sibling_filename[UV__FILENAME_LEN];
+	if (meta) {
+		rv = snprintf(sibling_filename, UV__FILENAME_LEN,
+			      UV__SNAPSHOT_TEMPLATE, term, index, timestamp);
+	} else {
+		rv = snprintf(sibling_filename, UV__FILENAME_LEN,
+			      UV__SNAPSHOT_META_TEMPLATE, term, index,
+			      timestamp);
+	}
+
+	if (rv >= UV__FILENAME_LEN) {
+		/* Output truncated */
+		return -1;
+	}
+
+	bool sibling_exists = false;
+	char ignored[RAFT_ERRMSG_BUF_SIZE];
+	rv = UvFsFileExists(dir, sibling_filename, &sibling_exists, ignored);
+	if (rv != 0) {
+		return rv;
+	}
+
+	*orphan = !sibling_exists;
+	return 0;
+}
+
+int UvSnapshotIsOrphan(const char *dir, const char *filename, bool *orphan)
+{
+	return uvSnapshotIsOrphanInternal(dir, filename, false, orphan);
+}
+
+int UvSnapshotMetaIsOrphan(const char *dir, const char *filename, bool *orphan)
+{
+	return uvSnapshotIsOrphanInternal(dir, filename, true, orphan);
+}
+
+/* Compare two snapshots to decide which one is more recent. */
+static int uvSnapshotCompare(const void *p1, const void *p2)
+{
+	struct uvSnapshotInfo *s1 = (struct uvSnapshotInfo *)p1;
+	struct uvSnapshotInfo *s2 = (struct uvSnapshotInfo *)p2;
+
+	/* If the terms are different, the snapshot with the highest term is
+	 * the most recent. */
+	if (s1->term != s2->term) {
+		return s1->term < s2->term ? -1 : 1;
+	}
+
+	/* If the terms are identical and the indexes differ, the snapshot
+	 * with the highest index is the most recent. */
+	if (s1->index != s2->index) {
+		return s1->index < s2->index ? -1 : 1;
+	}
+
+	/* If term and index are identical, compare the timestamps. */
+	return s1->timestamp < s2->timestamp ? -1 : 1;
+}
+
+/* Sort the given snapshots, from the least to the most recent. */
+void UvSnapshotSort(struct uvSnapshotInfo *infos, size_t n_infos)
+{
+	qsort(infos, n_infos, sizeof *infos, uvSnapshotCompare);
+}
+
+/* Parse the metadata file of a snapshot and populate the metadata portion of
+ * the given snapshot object accordingly.
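+ *
+ * The on-disk layout read below is (an editor's sketch, inferred from the
+ * code):
+ *
+ *   word 0: format version
+ *   word 1: CRC-32 of words 2 and 3 plus the configuration data
+ *   word 2: index the configuration applies to
+ *   word 3: length in bytes of the encoded configuration that follows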
*/ +static int uvSnapshotLoadMeta(struct uv *uv, + struct uvSnapshotInfo *info, + struct raft_snapshot *snapshot, + char *errmsg) +{ + uint64_t header[1 + /* Format version */ + 1 + /* CRC checksum */ + 1 + /* Configuration index */ + 1 /* Configuration length */]; + struct raft_buffer buf; + uint64_t format; + uint32_t crc1; + uint32_t crc2; + uv_file fd; + int rv; + + snapshot->term = info->term; + snapshot->index = info->index; + + rv = UvFsOpenFileForReading(uv->dir, info->filename, &fd, errmsg); + if (rv != 0) { + tracef("open %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err; + } + buf.base = header; + buf.len = sizeof header; + rv = UvFsReadInto(fd, &buf, errmsg); + if (rv != 0) { + tracef("read %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err_after_open; + } + + format = byteFlip64(header[0]); + if (format != UV__DISK_FORMAT) { + tracef("load %s: unsupported format %ju", info->filename, + format); + rv = RAFT_MALFORMED; + goto err_after_open; + } + + crc1 = (uint32_t)byteFlip64(header[1]); + + snapshot->configuration_index = byteFlip64(header[2]); + buf.len = (size_t)byteFlip64(header[3]); + if (buf.len > UV__META_MAX_CONFIGURATION_SIZE) { + tracef("load %s: configuration data too big (%zd)", + info->filename, buf.len); + rv = RAFT_CORRUPT; + goto err_after_open; + } + if (buf.len == 0) { + tracef("load %s: no configuration data", info->filename); + rv = RAFT_CORRUPT; + goto err_after_open; + } + buf.base = RaftHeapMalloc(buf.len); + if (buf.base == NULL) { + rv = RAFT_NOMEM; + goto err_after_open; + } + + rv = UvFsReadInto(fd, &buf, errmsg); + if (rv != 0) { + tracef("read %s: %s", info->filename, errmsg); + rv = RAFT_IOERR; + goto err_after_buf_malloc; + } + + crc2 = byteCrc32(header + 2, sizeof header - sizeof(uint64_t) * 2, 0); + crc2 = byteCrc32(buf.base, buf.len, crc2); + + if (crc1 != crc2) { + ErrMsgPrintf(errmsg, "read %s: checksum mismatch", + info->filename); + rv = RAFT_CORRUPT; + goto err_after_buf_malloc; + } + + rv = configurationDecode(&buf, &snapshot->configuration); + if (rv != 0) { + goto err_after_buf_malloc; + } + + RaftHeapFree(buf.base); + UvOsClose(fd); + + return 0; + +err_after_buf_malloc: + RaftHeapFree(buf.base); + +err_after_open: + close(fd); + +err: + assert(rv != 0); + return rv; +} + +/* Load the snapshot data file and populate the data portion of the given + * snapshot object accordingly. 
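+ *
+ * If the file content is compressed, it is transparently decompressed before
+ * being handed to the caller; see the IsCompressed()/Decompress() calls
+ * below.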
+ */
+static int uvSnapshotLoadData(struct uv *uv,
+			      struct uvSnapshotInfo *info,
+			      struct raft_snapshot *snapshot,
+			      char *errmsg)
+{
+	char filename[UV__FILENAME_LEN];
+	struct raft_buffer buf;
+	int rv;
+
+	uvSnapshotFilenameOf(info, filename);
+
+	rv = UvFsReadFile(uv->dir, filename, &buf, errmsg);
+	if (rv != 0) {
+		tracef("read %s: %s", filename, errmsg);
+		goto err;
+	}
+
+	if (IsCompressed(buf.base, buf.len)) {
+		struct raft_buffer decompressed = {0};
+		tracef("snapshot decompress start");
+		rv = Decompress(buf, &decompressed, errmsg);
+		tracef("snapshot decompress end %d", rv);
+		if (rv != 0) {
+			tracef("decompress failed rv:%d", rv);
+			goto err_after_read_file;
+		}
+		RaftHeapFree(buf.base);
+		buf = decompressed;
+	}
+
+	snapshot->bufs = RaftHeapMalloc(sizeof *snapshot->bufs);
+	if (snapshot->bufs == NULL) {
+		rv = RAFT_NOMEM;
+		goto err_after_read_file;
+	}
+	snapshot->n_bufs = 1;
+
+	snapshot->bufs[0] = buf;
+	return 0;
+
+err_after_read_file:
+	RaftHeapFree(buf.base);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+int UvSnapshotLoad(struct uv *uv,
+		   struct uvSnapshotInfo *meta,
+		   struct raft_snapshot *snapshot,
+		   char *errmsg)
+{
+	int rv;
+	rv = uvSnapshotLoadMeta(uv, meta, snapshot, errmsg);
+	if (rv != 0) {
+		return rv;
+	}
+	rv = uvSnapshotLoadData(uv, meta, snapshot, errmsg);
+	if (rv != 0) {
+		return rv;
+	}
+	return 0;
+}
+
+struct uvSnapshotPut
+{
+	struct uv *uv;
+	size_t trailing;
+	struct raft_io_snapshot_put *req;
+	const struct raft_snapshot *snapshot;
+	struct
+	{
+		unsigned long long timestamp;
+		uint64_t header[4]; /* Format, CRC, configuration index/len */
+		struct raft_buffer bufs[2]; /* Preamble and configuration */
+	} meta;
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int status;
+	struct UvBarrierReq barrier;
+};
+
+struct uvSnapshotGet
+{
+	struct uv *uv;
+	struct raft_io_snapshot_get *req;
+	struct raft_snapshot *snapshot;
+	struct uv_work_s work;
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int status;
+	queue queue;
+};
+
+static int uvSnapshotKeepLastTwo(struct uv *uv,
+				 struct uvSnapshotInfo *snapshots,
+				 size_t n)
+{
+	size_t i;
+	char errmsg[RAFT_ERRMSG_BUF_SIZE];
+	int rv;
+
+	/* Leave at least two snapshots, for safety. */
+	if (n <= 2) {
+		return 0;
+	}
+
+	for (i = 0; i < n - 2; i++) {
+		struct uvSnapshotInfo *snapshot = &snapshots[i];
+		char filename[UV__FILENAME_LEN];
+		rv = UvFsRemoveFile(uv->dir, snapshot->filename, errmsg);
+		if (rv != 0) {
+			tracef("unlink %s: %s", snapshot->filename, errmsg);
+			return RAFT_IOERR;
+		}
+		uvSnapshotFilenameOf(snapshot, filename);
+		rv = UvFsRemoveFile(uv->dir, filename, errmsg);
+		if (rv != 0) {
+			tracef("unlink %s: %s", filename, errmsg);
+			return RAFT_IOERR;
+		}
+	}
+
+	return 0;
+}
+
+/* Remove all segments and snapshots that are not needed anymore, because they
+ * fall behind the configured trailing amount.
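+ *
+ * The two most recent snapshots are always retained (see
+ * uvSnapshotKeepLastTwo above), and closed segments are kept as long as they
+ * hold entries within `trailing` of last_index.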
*/ +static int uvRemoveOldSegmentsAndSnapshots(struct uv *uv, + raft_index last_index, + size_t trailing, + char *errmsg) +{ + struct uvSnapshotInfo *snapshots; + struct uvSegmentInfo *segments; + size_t n_snapshots; + size_t n_segments; + int rv = 0; + + rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, + errmsg); + if (rv != 0) { + goto out; + } + rv = uvSnapshotKeepLastTwo(uv, snapshots, n_snapshots); + if (rv != 0) { + goto out; + } + if (segments != NULL) { + rv = uvSegmentKeepTrailing(uv, segments, n_segments, last_index, + trailing, errmsg); + if (rv != 0) { + goto out; + } + } + rv = UvFsSyncDir(uv->dir, errmsg); + +out: + if (snapshots != NULL) { + RaftHeapFree(snapshots); + } + if (segments != NULL) { + RaftHeapFree(segments); + } + return rv; +} + +static int makeFileCompressed(const char *dir, + const char *filename, + struct raft_buffer *bufs, + unsigned n_bufs, + char *errmsg) +{ + int rv; + + struct raft_buffer compressed = {0}; + rv = Compress(bufs, n_bufs, &compressed, errmsg); + if (rv != 0) { + ErrMsgWrapf(errmsg, "compress %s", filename); + return RAFT_IOERR; + } + + rv = UvFsMakeFile(dir, filename, &compressed, 1, errmsg); + raft_free(compressed.base); + return rv; +} + +static void uvSnapshotPutWorkCb(uv_work_t *work) +{ + struct uvSnapshotPut *put = work->data; + struct uv *uv = put->uv; + char metadata[UV__FILENAME_LEN]; + char snapshot[UV__FILENAME_LEN]; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + sprintf(metadata, UV__SNAPSHOT_META_TEMPLATE, put->snapshot->term, + put->snapshot->index, put->meta.timestamp); + + rv = UvFsMakeFile(uv->dir, metadata, put->meta.bufs, 2, put->errmsg); + if (rv != 0) { + tracef("snapshot.meta creation failed %d", rv); + ErrMsgWrapf(put->errmsg, "write %s", metadata); + put->status = RAFT_IOERR; + return; + } + + sprintf(snapshot, UV__SNAPSHOT_TEMPLATE, put->snapshot->term, + put->snapshot->index, put->meta.timestamp); + + tracef("snapshot write start"); + if (uv->snapshot_compression) { + rv = makeFileCompressed(uv->dir, snapshot, put->snapshot->bufs, + put->snapshot->n_bufs, put->errmsg); + } else { + rv = UvFsMakeFile(uv->dir, snapshot, put->snapshot->bufs, + put->snapshot->n_bufs, put->errmsg); + } + tracef("snapshot write end %d", rv); + + if (rv != 0) { + tracef("snapshot creation failed %d", rv); + ErrMsgWrapf(put->errmsg, "write %s", snapshot); + UvFsRemoveFile(uv->dir, metadata, errmsg); + UvFsRemoveFile(uv->dir, snapshot, errmsg); + put->status = RAFT_IOERR; + return; + } + + rv = UvFsSyncDir(uv->dir, put->errmsg); + if (rv != 0) { + put->status = RAFT_IOERR; + return; + } + + rv = uvRemoveOldSegmentsAndSnapshots(uv, put->snapshot->index, + put->trailing, put->errmsg); + if (rv != 0) { + put->status = rv; + return; + } + + put->status = 0; + + return; +} + +/* Finish the put request, releasing all associated memory and invoking its + * callback. */ +static void uvSnapshotPutFinish(struct uvSnapshotPut *put) +{ + struct raft_io_snapshot_put *req = put->req; + int status = put->status; + struct uv *uv = put->uv; + assert(uv->snapshot_put_work.data == NULL); + RaftHeapFree(put->meta.bufs[1].base); + RaftHeapFree(put); + req->cb(req, status); +} + +static void uvSnapshotPutAfterWorkCb(uv_work_t *work, int status) +{ + struct uvSnapshotPut *put = work->data; + struct uv *uv = put->uv; + assert(status == 0); + uv->snapshot_put_work.data = NULL; + uvSnapshotPutFinish(put); + UvUnblock(uv); +} + +/* Start processing the given put request. 
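+ *
+ * The file writes themselves happen off the main loop: uvSnapshotPutWorkCb
+ * runs on the libuv thread pool, and uvSnapshotPutAfterWorkCb then releases
+ * the barrier via UvUnblock().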
*/ +static void uvSnapshotPutStart(struct uvSnapshotPut *put) +{ + struct uv *uv = put->uv; + int rv; + + /* If this is an install request, the barrier callback must have fired. + */ + if (put->trailing == 0) { + assert(put->barrier.data == NULL); + } + + uv->snapshot_put_work.data = put; + rv = uv_queue_work(uv->loop, &uv->snapshot_put_work, + uvSnapshotPutWorkCb, uvSnapshotPutAfterWorkCb); + if (rv != 0) { + tracef("store snapshot %lld: %s", put->snapshot->index, + uv_strerror(rv)); + uv->errored = true; + } +} + +static void uvSnapshotPutBarrierCb(struct UvBarrierReq *barrier) +{ + /* Ensure that we don't invoke this callback more than once. */ + barrier->cb = NULL; + struct uvSnapshotPut *put = barrier->data; + if (put == NULL) { + return; + } + + struct uv *uv = put->uv; + put->barrier.data = NULL; + /* If we're closing, abort the request. */ + if (uv->closing) { + put->status = RAFT_CANCELED; + uvSnapshotPutFinish(put); + uvMaybeFireCloseCb(uv); + return; + } + uvSnapshotPutStart(put); +} + +int UvSnapshotPut(struct raft_io *io, + unsigned trailing, + struct raft_io_snapshot_put *req, + const struct raft_snapshot *snapshot, + raft_io_snapshot_put_cb cb) +{ + struct uv *uv; + struct uvSnapshotPut *put; + void *cursor; + unsigned crc; + int rv; + raft_index next_index; + + uv = io->impl; + if (uv->closing) { + return RAFT_CANCELED; + } + + assert(uv->snapshot_put_work.data == NULL); + + tracef("put snapshot at %lld, keeping %d", snapshot->index, trailing); + + put = RaftHeapMalloc(sizeof *put); + if (put == NULL) { + rv = RAFT_NOMEM; + goto err; + } + put->uv = uv; + put->req = req; + put->snapshot = snapshot; + put->meta.timestamp = uv_now(uv->loop); + put->trailing = trailing; + put->barrier.data = put; + put->barrier.blocking = trailing == 0; + put->barrier.cb = uvSnapshotPutBarrierCb; + + req->cb = cb; + + /* Prepare the buffers for the metadata file. */ + put->meta.bufs[0].base = put->meta.header; + put->meta.bufs[0].len = sizeof put->meta.header; + + rv = configurationEncode(&snapshot->configuration, &put->meta.bufs[1]); + if (rv != 0) { + goto err_after_req_alloc; + } + + cursor = put->meta.header; + bytePut64(&cursor, UV__DISK_FORMAT); + bytePut64(&cursor, 0); + bytePut64(&cursor, snapshot->configuration_index); + bytePut64(&cursor, put->meta.bufs[1].len); + + crc = byteCrc32(&put->meta.header[2], sizeof(uint64_t) * 2, 0); + crc = byteCrc32(put->meta.bufs[1].base, put->meta.bufs[1].len, crc); + + cursor = &put->meta.header[1]; + bytePut64(&cursor, crc); + + /* - If the trailing parameter is set to 0, it means that we're + * restoring a snapshot. Submit a barrier request setting the next + * append index to the snapshot's last index + 1. + * - When we are only writing a snapshot during normal operation, we + * close all current open segments. New writes can continue on newly + * opened segments that will only contain entries that are newer than + * the snapshot, and we don't change append_next_index. */ + next_index = + (trailing == 0) ? 
(snapshot->index + 1) : uv->append_next_index; + rv = UvBarrier(uv, next_index, &put->barrier); + if (rv != 0) { + goto err_after_configuration_encode; + } + + return 0; + +err_after_configuration_encode: + RaftHeapFree(put->meta.bufs[1].base); +err_after_req_alloc: + RaftHeapFree(put); +err: + assert(rv != 0); + return rv; +} + +static void uvSnapshotGetWorkCb(uv_work_t *work) +{ + struct uvSnapshotGet *get = work->data; + struct uv *uv = get->uv; + struct uvSnapshotInfo *snapshots; + size_t n_snapshots; + struct uvSegmentInfo *segments; + size_t n_segments; + int rv; + get->status = 0; + rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, + get->errmsg); + if (rv != 0) { + get->status = rv; + goto out; + } + if (snapshots != NULL) { + rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], + get->snapshot, get->errmsg); + if (rv != 0) { + get->status = rv; + } + RaftHeapFree(snapshots); + } + if (segments != NULL) { + RaftHeapFree(segments); + } +out: + return; +} + +static void uvSnapshotGetAfterWorkCb(uv_work_t *work, int status) +{ + struct uvSnapshotGet *get = work->data; + struct raft_io_snapshot_get *req = get->req; + struct raft_snapshot *snapshot = get->snapshot; + int req_status = get->status; + struct uv *uv = get->uv; + assert(status == 0); + QUEUE_REMOVE(&get->queue); + RaftHeapFree(get); + req->cb(req, snapshot, req_status); + uvMaybeFireCloseCb(uv); +} + +int UvSnapshotGet(struct raft_io *io, + struct raft_io_snapshot_get *req, + raft_io_snapshot_get_cb cb) +{ + struct uv *uv; + struct uvSnapshotGet *get; + int rv; + + uv = io->impl; + assert(!uv->closing); + + get = RaftHeapMalloc(sizeof *get); + if (get == NULL) { + rv = RAFT_NOMEM; + goto err; + } + get->uv = uv; + get->req = req; + req->cb = cb; + + get->snapshot = RaftHeapMalloc(sizeof *get->snapshot); + if (get->snapshot == NULL) { + rv = RAFT_NOMEM; + goto err_after_req_alloc; + } + get->work.data = get; + + QUEUE_PUSH(&uv->snapshot_get_reqs, &get->queue); + rv = uv_queue_work(uv->loop, &get->work, uvSnapshotGetWorkCb, + uvSnapshotGetAfterWorkCb); + if (rv != 0) { + QUEUE_REMOVE(&get->queue); + tracef("get last snapshot: %s", uv_strerror(rv)); + rv = RAFT_IOERR; + goto err_after_snapshot_alloc; + } + + return 0; + +err_after_snapshot_alloc: + RaftHeapFree(get->snapshot); +err_after_req_alloc: + RaftHeapFree(get); +err: + assert(rv != 0); + return rv; +} + +#undef tracef diff --git a/src/raft/uv_tcp.c b/src/raft/uv_tcp.c new file mode 100644 index 000000000..4196b9f56 --- /dev/null +++ b/src/raft/uv_tcp.c @@ -0,0 +1,127 @@ +#include "uv_tcp.h" +#include "uv_ip.h" + +#include + +#include "../raft.h" +#include "assert.h" +#include "err.h" +#include "heap.h" + +/* Implementation of raft_uv_transport->init. */ +static int uvTcpInit(struct raft_uv_transport *transport, + raft_id id, + const char *address) +{ + struct UvTcp *t = transport->impl; + assert(id > 0); + assert(address != NULL); + t->id = id; + t->address = address; + return 0; +} + +/* Implementation of raft_uv_transport->close. 
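+ *
+ * Closing tears down both the listener sockets and any in-flight connect
+ * requests; the user close callback fires only once all of them are gone (see
+ * UvTcpMaybeFireCloseCb below).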
*/ +static void uvTcpClose(struct raft_uv_transport *transport, + raft_uv_transport_close_cb cb) +{ + struct UvTcp *t = transport->impl; + assert(!t->closing); + t->closing = true; + t->close_cb = cb; + UvTcpListenClose(t); + UvTcpConnectClose(t); + UvTcpMaybeFireCloseCb(t); +} + +void UvTcpMaybeFireCloseCb(struct UvTcp *t) +{ + if (!t->closing) { + return; + } + + assert(QUEUE_IS_EMPTY(&t->accepting)); + assert(QUEUE_IS_EMPTY(&t->connecting)); + if (!QUEUE_IS_EMPTY(&t->aborting)) { + return; + } + + if (t->listeners != NULL) { + return; + } + + if (t->close_cb != NULL) { + t->close_cb(t->transport); + } +} + +int raft_uv_tcp_init(struct raft_uv_transport *transport, + struct uv_loop_s *loop) +{ + struct UvTcp *t; + void *data = transport->data; + int version = transport->version; + if (version != 1) { + ErrMsgPrintf(transport->errmsg, "Invalid version: %d", version); + return RAFT_INVALID; + } + + memset(transport, 0, sizeof *transport); + transport->data = data; + transport->version = version; + t = raft_malloc(sizeof *t); + if (t == NULL) { + ErrMsgOom(transport->errmsg); + return RAFT_NOMEM; + } + t->transport = transport; + t->loop = loop; + t->id = 0; + t->address = NULL; + t->bind_address = NULL; + t->listeners = NULL; + t->n_listeners = 0; + t->accept_cb = NULL; + QUEUE_INIT(&t->accepting); + QUEUE_INIT(&t->connecting); + QUEUE_INIT(&t->aborting); + t->closing = false; + t->close_cb = NULL; + + transport->impl = t; + transport->init = uvTcpInit; + transport->close = uvTcpClose; + transport->listen = UvTcpListen; + transport->connect = UvTcpConnect; + + return 0; +} + +void raft_uv_tcp_close(struct raft_uv_transport *transport) +{ + struct UvTcp *t = transport->impl; + raft_free(t->bind_address); + raft_free(t); +} + +int raft_uv_tcp_set_bind_address(struct raft_uv_transport *transport, + const char *address) +{ + struct UvTcp *t = transport->impl; + char hostname[NI_MAXHOST]; + char service[NI_MAXSERV]; + int rv; + + rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service, + sizeof(service)); + if (rv != 0) { + return RAFT_INVALID; + } + + t->bind_address = raft_malloc(strlen(address) + 1); + if (t->bind_address == NULL) { + return RAFT_NOMEM; + } + strcpy(t->bind_address, address); + return 0; +} diff --git a/src/raft/uv_tcp.h b/src/raft/uv_tcp.h new file mode 100644 index 000000000..924f4f5b3 --- /dev/null +++ b/src/raft/uv_tcp.h @@ -0,0 +1,48 @@ +#ifndef UV_TCP_H_ +#define UV_TCP_H_ + +#include "../raft.h" +#include "queue.h" + +/* Protocol version. */ +#define UV__TCP_HANDSHAKE_PROTOCOL 1 + +struct UvTcp +{ + struct raft_uv_transport *transport; /* Interface object we implement */ + struct uv_loop_s *loop; /* Event loop */ + raft_id id; /* ID of this raft server */ + const char *address; /* Address of this raft server */ + unsigned n_listeners; /* Number of listener sockets */ + struct uv_tcp_s *listeners; /* Listener sockets */ + raft_uv_accept_cb accept_cb; /* Call after accepting a connection */ + queue accepting; /* Connections being accepted */ + queue connecting; /* Pending connection requests */ + queue aborting; /* Connections being aborted */ + bool closing; /* True after close() is called */ + raft_uv_transport_close_cb + close_cb; /* Call when it's safe to free us */ + char *bind_address; /* Optional address:port to bind to */ +}; + +/* Implementation of raft_uv_transport->listen. */ +int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb); + +/* Stop accepting new connection and close all connections being accepted. 
+ */
+void UvTcpListenClose(struct UvTcp *t);
+
+/* Implementation of raft_uv_transport->connect. */
+int UvTcpConnect(struct raft_uv_transport *transport,
+		 struct raft_uv_connect *req,
+		 raft_id id,
+		 const char *address,
+		 raft_uv_connect_cb cb);
+
+/* Abort all pending connection requests. */
+void UvTcpConnectClose(struct UvTcp *t);
+
+/* Fire the transport close callback if the transport is closing and there's no
+ * more pending callback. */
+void UvTcpMaybeFireCloseCb(struct UvTcp *t);
+
+#endif /* UV_TCP_H_ */
diff --git a/src/raft/uv_tcp_connect.c b/src/raft/uv_tcp_connect.c
new file mode 100644
index 000000000..e493d14a8
--- /dev/null
+++ b/src/raft/uv_tcp_connect.c
@@ -0,0 +1,382 @@
+#include <string.h>
+
+#include "assert.h"
+#include "byte.h"
+#include "err.h"
+#include "heap.h"
+#include "uv_ip.h"
+#include "uv_tcp.h"
+
+/* The happy path of a connection request is:
+ *
+ * - Create a TCP handle and submit a TCP connect request.
+ * - Initiate an asynchronous DNS resolve request.
+ * - Once the name lookup has succeeded, connect to the first resolved IP.
+ * - Once connected over TCP, submit a write request for the handshake.
+ * - Once the write completes, fire the connection request callback.
+ *
+ * Alternative happy path of a connection request, if the hostname resolves to
+ * multiple IPs and the first/second/... IP is reachable:
+ * - close the TCP handle and initiate a new connect with the next IP in the
+ *   close callback.
+ *
+ * Possible failure modes are:
+ *
+ * - The name resolution for the hostname fails: close the TCP handle and fire
+ *   the request callback.
+ *
+ * - The transport gets closed: close the TCP handle and fire the request
+ *   callback with RAFT_CANCELED.
+ *
+ * - Either the TCP connect or the write request fails: close the TCP handle
+ *   and fire the request callback with RAFT_NOCONNECTION.
+ */
+
+/* Hold state for a single connection request. */
+struct uvTcpConnect
+{
+	struct UvTcp *t;             /* Transport implementation */
+	struct raft_uv_connect *req; /* User request */
+	uv_buf_t handshake;          /* Handshake data */
+	struct uv_tcp_s *tcp;        /* TCP connection socket handle */
+	struct uv_getaddrinfo_s getaddrinfo; /* DNS resolve request */
+	const struct addrinfo
+	    *ai_current; /* The current sockaddr to connect to */
+	struct uv_connect_s connect; /* TCP connection request */
+	struct uv_write_s write;     /* TCP handshake request */
+	int status;                  /* Returned to the request callback */
+	bool resolving; /* Indicate name resolving in progress */
+	bool retry;     /* Indicate tcp connect failure handling */
+	queue queue;    /* Pending connect queue */
+};
+
+/* Encode a handshake message into the given buffer. */
+static int uvTcpEncodeHandshake(raft_id id, const char *address, uv_buf_t *buf)
+{
+	void *cursor;
+	size_t address_len = bytePad64(strlen(address) + 1);
+	buf->len = sizeof(uint64_t) + /* Protocol version. */
+		   sizeof(uint64_t) + /* Server ID. */
+		   sizeof(uint64_t) /* Size of the address buffer */;
+	buf->len += address_len;
+	buf->base = RaftHeapMalloc(buf->len);
+	if (buf->base == NULL) {
+		return RAFT_NOMEM;
+	}
+	cursor = buf->base;
+	bytePut64(&cursor, UV__TCP_HANDSHAKE_PROTOCOL);
+	bytePut64(&cursor, id);
+	bytePut64(&cursor, address_len);
+	strcpy(cursor, address);
+	return 0;
+}
+
+/* Finish the connect request, releasing its memory and firing the connect
+ * callback.
*/ +static void uvTcpConnectFinish(struct uvTcpConnect *connect) +{ + struct uv_stream_s *stream = (struct uv_stream_s *)connect->tcp; + struct raft_uv_connect *req = connect->req; + int status = connect->status; + QUEUE_REMOVE(&connect->queue); + RaftHeapFree(connect->handshake.base); + uv_freeaddrinfo(connect->getaddrinfo.addrinfo); + raft_free(connect); + req->cb(req, stream, status); +} + +/* The TCP connection handle has been closed in consequence of an error or + * because the transport is closing. */ +static void uvTcpConnectUvCloseCb(struct uv_handle_s *handle) +{ + struct uvTcpConnect *connect = handle->data; + struct UvTcp *t = connect->t; + assert(connect->status != 0); + assert(handle == (struct uv_handle_s *)connect->tcp); + RaftHeapFree(connect->tcp); + connect->tcp = NULL; + uvTcpConnectFinish(connect); + UvTcpMaybeFireCloseCb(t); +} + +/* Abort a connection request. */ +static void uvTcpConnectAbort(struct uvTcpConnect *connect) +{ + QUEUE_REMOVE(&connect->queue); + QUEUE_PUSH(&connect->t->aborting, &connect->queue); + uv_cancel((struct uv_req_s *)&connect->getaddrinfo); + /* Call uv_close on the tcp handle, if there is no getaddrinfo request + * in flight and the handle is not currently closed due to next IP + * connect attempt. + * Data structures may only be freed after the uvGetAddrInfoCb was + * triggered. Tcp handle will be closed in the uvGetAddrInfoCb in this + * case. uvTcpConnectUvCloseCb will be invoked from + * uvTcpTryNextConnectCb in case a next IP connect should be started. */ + if (!connect->resolving && !connect->retry) { + uv_close((struct uv_handle_s *)connect->tcp, + uvTcpConnectUvCloseCb); + } +} + +/* The handshake TCP write completes. Fire the connect callback. */ +static void uvTcpConnectUvWriteCb(struct uv_write_s *write, int status) +{ + struct uvTcpConnect *connect = write->data; + struct UvTcp *t = connect->t; + + if (t->closing) { + connect->status = RAFT_CANCELED; + return; + } + + if (status != 0) { + assert(status != + UV_ECANCELED); /* t->closing would have been true */ + connect->status = RAFT_NOCONNECTION; + uvTcpConnectAbort(connect); + return; + } + + uvTcpConnectFinish(connect); +} + +/* Helper function to connect to the remote node */ +static void uvTcpAsyncConnect(struct uvTcpConnect *connect); + +/* The TCP connect failed, we closed the handle and want to try with next IP */ +static void uvTcpTryNextConnectCb(struct uv_handle_s *handle) +{ + struct uvTcpConnect *connect = handle->data; + struct UvTcp *t = connect->t; + int rv; + + connect->retry = false; + + if (t->closing) { + connect->status = RAFT_CANCELED; + /* We are already in close cb for the tcp handle, simply invoke + * final cb + */ + uvTcpConnectUvCloseCb(handle); + return; + } + rv = uv_tcp_init(t->loop, connect->tcp); + assert(rv == 0); + uvTcpAsyncConnect(connect); +} + +/* The TCP connection is established. Write the handshake data. */ +static void uvTcpConnectUvConnectCb(struct uv_connect_s *req, int status) +{ + struct uvTcpConnect *connect = req->data; + struct UvTcp *t = connect->t; + int rv; + + if (t->closing) { + connect->status = RAFT_CANCELED; + return; + } + + if (status != 0) { + assert(status != + UV_ECANCELED); /* t->closing would have been true */ + connect->ai_current = connect->ai_current->ai_next; + if (connect->ai_current) { + /* For the next connect attempt we need to close the tcp + * handle. 
+			 */
+			/* To avoid interference with aborting we set a flag to
+			 * indicate the connect attempt */
+			connect->retry = true;
+			uv_close((struct uv_handle_s *)connect->tcp,
+				 uvTcpTryNextConnectCb);
+			return;
+		}
+		connect->status = RAFT_NOCONNECTION;
+		ErrMsgPrintf(t->transport->errmsg, "uv_tcp_connect(): %s",
+			     uv_strerror(status));
+		goto err;
+	}
+
+	rv = uv_write(&connect->write, (struct uv_stream_s *)connect->tcp,
+		      &connect->handshake, 1, uvTcpConnectUvWriteCb);
+	if (rv != 0) {
+		/* UNTESTED: what are the error conditions? perhaps ENOMEM */
+		connect->status = RAFT_NOCONNECTION;
+		goto err;
+	}
+
+	return;
+
+err:
+	uvTcpConnectAbort(connect);
+}
+
+/* Helper function to connect to the remote node */
+static void uvTcpAsyncConnect(struct uvTcpConnect *connect)
+{
+	int rv;
+	rv = uv_tcp_connect(&connect->connect, connect->tcp,
+			    connect->ai_current->ai_addr,
+			    uvTcpConnectUvConnectCb);
+	if (rv != 0) {
+		/* UNTESTED: since parsing succeeded, this should fail only
+		 * because of lack of system resources */
+		ErrMsgPrintf(connect->t->transport->errmsg,
+			     "uv_tcp_connect(): %s", uv_strerror(rv));
+		connect->status = RAFT_NOCONNECTION;
+		uvTcpConnectAbort(connect);
+	}
+}
+
+/* The hostname resolution has finished */
+static void uvGetAddrInfoCb(uv_getaddrinfo_t *req,
+			    int status,
+			    struct addrinfo *res)
+{
+	struct uvTcpConnect *connect = req->data;
+	struct UvTcp *t = connect->t;
+
+	connect->resolving = false; /* The name resolving phase is over */
+
+	if (t->closing) {
+		connect->status = RAFT_CANCELED;
+
+		/* We need to close the tcp handle to abort the connection
+		 * attempt */
+		uv_close((struct uv_handle_s *)connect->tcp,
+			 uvTcpConnectUvCloseCb);
+		return;
+	}
+
+	if (status < 0) {
+		ErrMsgPrintf(t->transport->errmsg, "uv_getaddrinfo(): %s",
+			     uv_err_name(status));
+		connect->status = RAFT_NOCONNECTION;
+		uvTcpConnectAbort(connect);
+		return;
+	}
+	connect->ai_current = res;
+	uvTcpAsyncConnect(connect);
+}
+
+/* Create a new TCP handle and submit a connection request to the event loop. */
+static int uvTcpConnectStart(struct uvTcpConnect *r, const char *address)
+{
+	static struct addrinfo hints = {.ai_flags = AI_V4MAPPED | AI_ADDRCONFIG,
+					.ai_family = AF_INET,
+					.ai_socktype = SOCK_STREAM,
+					.ai_protocol = 0};
+	struct UvTcp *t = r->t;
+	char hostname[NI_MAXHOST];
+	char service[NI_MAXSERV];
+	int rv;
+
+	r->handshake.base = NULL;
+
+	/* Initialize the handshake buffer.
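+	 *
+	 * As laid out by uvTcpEncodeHandshake() above, it holds three 64-bit
+	 * words (protocol version, server ID, address length) followed by the
+	 * address string, padded to a multiple of 8 bytes.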
*/ + rv = uvTcpEncodeHandshake(t->id, t->address, &r->handshake); + if (rv != 0) { + assert(rv == RAFT_NOMEM); + ErrMsgOom(t->transport->errmsg); + goto err; + } + + r->tcp = RaftHeapMalloc(sizeof *r->tcp); + if (r->tcp == NULL) { + ErrMsgOom(t->transport->errmsg); + rv = RAFT_NOMEM; + goto err; + } + + rv = uv_tcp_init(r->t->loop, r->tcp); + assert(rv == 0); + r->tcp->data = r; + + rv = uvIpAddrSplit(address, hostname, sizeof(hostname), service, + sizeof(service)); + if (rv) { + ErrMsgPrintf( + t->transport->errmsg, + "uv_tcp_connect(): Cannot split %s into host and service", + address); + rv = RAFT_NOCONNECTION; + goto err_after_tcp_init; + } + rv = uv_getaddrinfo(r->t->loop, &r->getaddrinfo, &uvGetAddrInfoCb, + hostname, service, &hints); + if (rv) { + ErrMsgPrintf(t->transport->errmsg, + "uv_tcp_connect(): Cannot initiate getaddrinfo %s", + uv_strerror(rv)); + rv = RAFT_NOCONNECTION; + goto err_after_tcp_init; + } + r->resolving = true; /* Indicate we are in the name resolving phase */ + + return 0; + +err_after_tcp_init: + uv_close((uv_handle_t *)r->tcp, (uv_close_cb)RaftHeapFree); + +err: + RaftHeapFree(r->handshake.base); + + return rv; +} + +int UvTcpConnect(struct raft_uv_transport *transport, + struct raft_uv_connect *req, + raft_id id, + const char *address, + raft_uv_connect_cb cb) +{ + struct UvTcp *t = transport->impl; + struct uvTcpConnect *r; + int rv; + (void)id; + assert(!t->closing); + + /* Create and initialize a new TCP connection request object */ + r = RaftHeapMalloc(sizeof *r); + if (r == NULL) { + rv = RAFT_NOMEM; + ErrMsgOom(transport->errmsg); + goto err; + } + r->t = t; + r->req = req; + r->status = 0; + r->write.data = r; + r->getaddrinfo.data = r; + r->resolving = false; + r->retry = false; + r->connect.data = r; + req->cb = cb; + + /* Keep track of the pending request */ + QUEUE_PUSH(&t->connecting, &r->queue); + + /* Start connecting */ + rv = uvTcpConnectStart(r, address); + if (rv != 0) { + goto err_after_alloc; + } + + return 0; + +err_after_alloc: + QUEUE_REMOVE(&r->queue); + RaftHeapFree(r); +err: + return rv; +} + +void UvTcpConnectClose(struct UvTcp *t) +{ + while (!QUEUE_IS_EMPTY(&t->connecting)) { + struct uvTcpConnect *connect; + queue *head; + head = QUEUE_HEAD(&t->connecting); + connect = QUEUE_DATA(head, struct uvTcpConnect, queue); + uvTcpConnectAbort(connect); + } +} diff --git a/src/raft/uv_tcp_listen.c b/src/raft/uv_tcp_listen.c new file mode 100644 index 000000000..41b6ca1ad --- /dev/null +++ b/src/raft/uv_tcp_listen.c @@ -0,0 +1,427 @@ +#include + +#include "assert.h" +#include "byte.h" +#include "heap.h" +#include "uv_ip.h" +#include "uv_tcp.h" + +/* The happy path of an incoming connection is: + * + * - The connection callback is fired on the listener TCP handle, and the + * incoming connection is uv_accept()'ed. We call uv_read_start() to get + * notified about received handshake data. + * + * - Once the preamble is received, we start waiting for the server address. + * + * - Once the server address is received, we fire the receive callback. + * + * Possible failure modes are: + * + * - The accept process gets canceled in the transport->close() implementation, + * by calling tcp_accept_stop(): the incoming TCP connection handle gets + * closed, preventing any further handshake data notification, and all + * allocated memory gets released in the handle close callback. + */ + +/* Hold state for a connection being accepted. 
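+ *
+ * The handshake preamble mirrors what uvTcpEncodeHandshake() in
+ * uv_tcp_connect.c sends: protocol version, server ID and address length,
+ * followed by the address itself.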
*/ +struct uvTcpHandshake +{ + uint64_t preamble[3]; /* Preamble buffer */ + uv_buf_t address; /* Address buffer */ + size_t nread; /* Number of bytes read */ +}; + +/* Hold handshake data for a new connection being established. */ +struct uvTcpIncoming +{ + struct UvTcp *t; /* Transport implementation */ + struct uv_tcp_s + *listener; /* The tcp handle, which accepted this socket */ + struct uv_tcp_s *tcp; /* TCP connection socket handle */ + struct uvTcpHandshake handshake; /* Handshake data */ + queue queue; /* Pending accept queue */ +}; + +/* Decode the handshake preamble, containing the protocol version, the ID of the + * connecting server and the length of its address. Also, allocate the buffer to + * start reading the server address. */ +static int uvTcpDecodePreamble(struct uvTcpHandshake *h) +{ + uint64_t protocol; + protocol = byteFlip64(h->preamble[0]); + if (protocol != UV__TCP_HANDSHAKE_PROTOCOL) { + return RAFT_MALFORMED; + } + h->address.len = (size_t)byteFlip64(h->preamble[2]); + h->address.base = RaftHeapMalloc(h->address.len); + if (h->address.base == NULL) { + return RAFT_NOMEM; + } + h->nread = 0; + return 0; +} + +/* The accepted TCP client connection has been closed, release all memory + * associated with accept object. We can get here only if an error occurrent + * during the handshake or if raft_uv_transport->close() has been invoked. */ +static void uvTcpIncomingCloseCb(struct uv_handle_s *handle) +{ + struct uvTcpIncoming *incoming = handle->data; + struct UvTcp *t = incoming->t; + QUEUE_REMOVE(&incoming->queue); + if (incoming->handshake.address.base != NULL) { + RaftHeapFree(incoming->handshake.address.base); + } + RaftHeapFree(incoming->tcp); + RaftHeapFree(incoming); + UvTcpMaybeFireCloseCb(t); +} + +/* Close an incoming TCP connection which hasn't complete the handshake yet. */ +static void uvTcpIncomingAbort(struct uvTcpIncoming *incoming) +{ + struct UvTcp *t = incoming->t; + /* After uv_close() returns we are guaranteed that no more alloc_cb or + * read_cb will be called. */ + QUEUE_REMOVE(&incoming->queue); + QUEUE_PUSH(&t->aborting, &incoming->queue); + uv_close((struct uv_handle_s *)incoming->tcp, uvTcpIncomingCloseCb); +} + +/* Read the address part of the handshake. */ +static void uvTcpIncomingAllocCbAddress(struct uv_handle_s *handle, + size_t suggested_size, + uv_buf_t *buf) +{ + struct uvTcpIncoming *incoming = handle->data; + (void)suggested_size; + assert(!incoming->t->closing); + buf->base = + incoming->handshake.address.base + incoming->handshake.nread; + buf->len = incoming->handshake.address.len - incoming->handshake.nread; +} + +static void uvTcpIncomingReadCbAddress(uv_stream_t *stream, + ssize_t nread, + const uv_buf_t *buf) +{ + struct uvTcpIncoming *incoming = stream->data; + char *address; + raft_id id; + size_t n; + int rv; + + (void)buf; + assert(!incoming->t->closing); + + if (nread == 0) { + /* Empty read just ignore it. */ + return; + } + if (nread < 0) { + uvTcpIncomingAbort(incoming); + return; + } + + /* We shouldn't have read more data than the pending amount. */ + n = (size_t)nread; + assert(n <= + incoming->handshake.address.len - incoming->handshake.nread); + + /* Advance the read window */ + incoming->handshake.nread += n; + + /* If there's more data to read in order to fill the current + * read buffer, just return, we'll be invoked again. */ + if (incoming->handshake.nread < incoming->handshake.address.len) { + return; + } + + /* If we have completed reading the address, let's fire the callback. 
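+	 *
+	 * Note that ownership of the TCP handle passes to the accept
+	 * callback; only the bookkeeping structures are freed here.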
*/ + rv = uv_read_stop(stream); + assert(rv == 0); + id = byteFlip64(incoming->handshake.preamble[1]); + address = incoming->handshake.address.base; + QUEUE_REMOVE(&incoming->queue); + incoming->t->accept_cb(incoming->t->transport, id, address, + (struct uv_stream_s *)incoming->tcp); + RaftHeapFree(incoming->handshake.address.base); + RaftHeapFree(incoming); +} + +/* Read the preamble of the handshake. */ +static void uvTcpIncomingAllocCbPreamble(struct uv_handle_s *handle, + size_t suggested_size, + uv_buf_t *buf) +{ + struct uvTcpIncoming *incoming = handle->data; + (void)suggested_size; + buf->base = + (char *)incoming->handshake.preamble + incoming->handshake.nread; + buf->len = + sizeof incoming->handshake.preamble - incoming->handshake.nread; +} + +static void uvTcpIncomingReadCbPreamble(uv_stream_t *stream, + ssize_t nread, + const uv_buf_t *buf) +{ + struct uvTcpIncoming *incoming = stream->data; + size_t n; + int rv; + + (void)buf; + + if (nread == 0) { + /* Empty read just ignore it. */ + return; + } + if (nread < 0) { + uvTcpIncomingAbort(incoming); + return; + } + + /* We shouldn't have read more data than the pending amount. */ + n = (size_t)nread; + assert(n <= + sizeof incoming->handshake.preamble - incoming->handshake.nread); + + /* Advance the read window */ + incoming->handshake.nread += n; + + /* If there's more data to read in order to fill the current + * read buffer, just return, we'll be invoked again. */ + if (incoming->handshake.nread < sizeof incoming->handshake.preamble) { + return; + } + + /* If we have completed reading the preamble, let's parse it. */ + rv = uvTcpDecodePreamble(&incoming->handshake); + if (rv != 0) { + uvTcpIncomingAbort(incoming); + return; + } + + rv = uv_read_stop(stream); + assert(rv == 0); + rv = uv_read_start((uv_stream_t *)incoming->tcp, + uvTcpIncomingAllocCbAddress, + uvTcpIncomingReadCbAddress); + assert(rv == 0); +} + +/* Start reading handshake data for a new incoming connection. */ +static int uvTcpIncomingStart(struct uvTcpIncoming *incoming) +{ + int rv; + + memset(&incoming->handshake, 0, sizeof incoming->handshake); + + incoming->tcp = RaftHeapMalloc(sizeof *incoming->tcp); + if (incoming->tcp == NULL) { + return RAFT_NOMEM; + } + incoming->tcp->data = incoming; + + rv = uv_tcp_init(incoming->t->loop, incoming->tcp); + assert(rv == 0); + + rv = uv_accept((struct uv_stream_s *)incoming->listener, + (struct uv_stream_s *)incoming->tcp); + if (rv != 0) { + rv = RAFT_IOERR; + goto err_after_tcp_init; + } + rv = uv_read_start((uv_stream_t *)incoming->tcp, + uvTcpIncomingAllocCbPreamble, + uvTcpIncomingReadCbPreamble); + assert(rv == 0); + + return 0; + +err_after_tcp_init: + uv_close((uv_handle_t *)incoming->tcp, (uv_close_cb)RaftHeapFree); + return rv; +} + +#define IS_IN_ARRAY(elem, array, array_size) \ + (const char *)(elem) >= (const char *)(array) && \ + (const char *)(elem) < \ + (const char *)(array) + array_size * sizeof(*array) + +/* Called when there's a new incoming connection: create a new tcp_accept object + * and start receiving handshake data. 
+ */
+static void uvTcpListenCb(struct uv_stream_s *stream, int status)
+{
+	struct UvTcp *t = stream->data;
+	struct uvTcpIncoming *incoming;
+	int rv;
+
+	assert(IS_IN_ARRAY(stream, t->listeners, t->n_listeners));
+
+	if (status != 0) {
+		rv = RAFT_IOERR;
+		goto err;
+	}
+
+	incoming = RaftHeapMalloc(sizeof *incoming);
+	if (incoming == NULL) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+	incoming->t = t;
+	incoming->listener = (struct uv_tcp_s *)stream;
+	incoming->tcp = NULL;
+
+	QUEUE_PUSH(&t->accepting, &incoming->queue);
+
+	rv = uvTcpIncomingStart(incoming);
+	if (rv != 0) {
+		goto err_after_accept_alloc;
+	}
+
+	return;
+
+err_after_accept_alloc:
+	QUEUE_REMOVE(&incoming->queue);
+	RaftHeapFree(incoming);
+err:
+	assert(rv != 0);
+}
+
+/* Do bind/listen call on the tcp handle */
+static int uvTcpBindListen(struct uv_tcp_s *listener, struct sockaddr *addr)
+{
+	if (uv_tcp_bind(listener, addr, 0) ||
+	    uv_listen((uv_stream_t *)listener, 1, uvTcpListenCb)) {
+		return RAFT_IOERR;
+	}
+	return 0;
+}
+
+/* Create a tcp handle and do bind/listen for each IP */
+static int uvTcpListenOnMultipleIP(struct raft_uv_transport *transport,
+				   struct addrinfo *addr_infos)
+{
+	struct UvTcp *t;
+	struct addrinfo *current;
+	unsigned n_listeners;
+	int rv;
+
+	t = transport->impl;
+
+	n_listeners = 0;
+	for (current = addr_infos; current; current = current->ai_next) {
+		++n_listeners;
+	}
+
+	current = addr_infos;
+	t->listeners = raft_malloc(n_listeners * sizeof(*t->listeners));
+	if (!t->listeners) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+
+	t->n_listeners = n_listeners;
+	for (n_listeners = 0; n_listeners < t->n_listeners; ++n_listeners) {
+		struct uv_tcp_s *listener = &t->listeners[n_listeners];
+		listener->data = t;
+		if (uv_tcp_init(t->loop, listener) ||
+		    uvTcpBindListen(listener, current->ai_addr)) {
+			rv = RAFT_IOERR;
+			goto err;
+		}
+		current = current->ai_next;
+	}
+	return 0;
+
+err:
+	if (t->listeners) {
+		for (unsigned i = 0; i <= n_listeners; ++i) {
+			uv_close((struct uv_handle_s *)&t->listeners[i], NULL);
+		}
+		raft_free(t->listeners);
+		t->listeners = NULL;
+		t->n_listeners = 0;
+	}
+	return rv;
+}
+
+/* Ignore duplicate entries from glibc getaddrinfo due to
+ * https://bugzilla.redhat.com/show_bug.cgi?id=496300
+ * in case of resolving localhost */
+static bool uvIsAddressDuplication(struct addrinfo *addr_info)
+{
+	struct addrinfo *next = addr_info->ai_next;
+
+	/* Check if we have a list of length 2 */
+	if (!next || next->ai_next) {
+		return false;
+	}
+	if (addr_info->ai_addrlen != next->ai_addrlen ||
+	    bcmp(addr_info->ai_addr, next->ai_addr, addr_info->ai_addrlen)) {
+		return false;
+	}
+	return true;
+}
+
+int UvTcpListen(struct raft_uv_transport *transport, raft_uv_accept_cb cb)
+{
+	struct UvTcp *t;
+	struct addrinfo *addr_infos;
+	int rv;
+
+	t = transport->impl;
+	t->accept_cb = cb;
+
+	if (t->bind_address == NULL) {
+		rv = uvIpResolveBindAddresses(t->address, &addr_infos);
+	} else {
+		rv = uvIpResolveBindAddresses(t->bind_address, &addr_infos);
+	}
+	if (rv != 0 || !addr_infos) {
+		return rv;
+	}
+	if (addr_infos->ai_next && uvIsAddressDuplication(addr_infos)) {
+		rv = uvTcpListenOnMultipleIP(transport, addr_infos->ai_next);
+	} else {
+		rv = uvTcpListenOnMultipleIP(transport, addr_infos);
+	}
+	freeaddrinfo(addr_infos);
+	return rv;
+}
+
+/* Close callback for uvTcp->listener.
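+ *
+ * All listener handles share this callback: every invocation decrements
+ * t->n_listeners, and only the invocation that brings the count to zero
+ * frees the listeners array and possibly fires the transport close
+ * callback. This is what allows UvTcpListenClose() below to simply
+ * uv_close() every listener in a loop without further bookkeeping.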
*/ +static void uvTcpListenCloseCbListener(struct uv_handle_s *handle) +{ + struct UvTcp *t = handle->data; + assert(t->closing); + assert(t->n_listeners); + assert(t->listeners); + if (--t->n_listeners == 0) { + raft_free(t->listeners); + t->listeners = NULL; + UvTcpMaybeFireCloseCb(t); + } +} + +void UvTcpListenClose(struct UvTcp *t) +{ + queue *head; + assert(t->closing); + + while (!QUEUE_IS_EMPTY(&t->accepting)) { + struct uvTcpIncoming *incoming; + head = QUEUE_HEAD(&t->accepting); + incoming = QUEUE_DATA(head, struct uvTcpIncoming, queue); + uvTcpIncomingAbort(incoming); + } + + if (t->n_listeners) { + for (unsigned i = 0; i < t->n_listeners; ++i) { + uv_close((struct uv_handle_s *)&t->listeners[i], + uvTcpListenCloseCbListener); + } + } +} diff --git a/src/raft/uv_truncate.c b/src/raft/uv_truncate.c new file mode 100644 index 000000000..51bd84fcb --- /dev/null +++ b/src/raft/uv_truncate.c @@ -0,0 +1,200 @@ +#include +#include + +#include "assert.h" +#include "byte.h" +#include "heap.h" +#include "uv.h" +#include "uv_encoding.h" + +/* Track a truncate request. */ +struct uvTruncate +{ + struct uv *uv; + struct UvBarrierReq barrier; + raft_index index; + int status; +}; + +/* Execute a truncate request in a thread. */ +static void uvTruncateWorkCb(uv_work_t *work) +{ + struct uvTruncate *truncate = work->data; + struct uv *uv = truncate->uv; + tracef("uv truncate work cb"); + struct uvSnapshotInfo *snapshots; + struct uvSegmentInfo *segments; + struct uvSegmentInfo *segment; + size_t n_snapshots; + size_t n_segments; + size_t i; + size_t j; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + int rv; + + /* Load all segments on disk. */ + rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, + errmsg); + if (rv != 0) { + goto err; + } + if (snapshots != NULL) { + RaftHeapFree(snapshots); + } + assert(segments != NULL); + + /* Find the segment that contains the truncate point. */ + segment = NULL; /* Suppress warnings. */ + for (i = 0; i < n_segments; i++) { + segment = &segments[i]; + if (segment->is_open) { + continue; + } + if (truncate->index >= segment->first_index && + truncate->index <= segment->end_index) { + break; + } + } + assert(i < n_segments); + + /* If the truncate index is not the first of the segment, we need to + * truncate it. */ + if (truncate->index > segment->first_index) { + rv = uvSegmentTruncate(uv, segment, truncate->index); + if (rv != 0) { + goto err_after_list; + } + } + + /* Remove all closed segments past the one containing the truncate + * index. 
*/ + for (j = i; j < n_segments; j++) { + segment = &segments[j]; + if (segment->is_open) { + continue; + } + rv = UvFsRemoveFile(uv->dir, segment->filename, errmsg); + if (rv != 0) { + tracef("unlink segment %s: %s", segment->filename, + errmsg); + rv = RAFT_IOERR; + goto err_after_list; + } + } + rv = UvFsSyncDir(uv->dir, errmsg); + if (rv != 0) { + tracef("sync data directory: %s", errmsg); + rv = RAFT_IOERR; + goto err_after_list; + } + + RaftHeapFree(segments); + truncate->status = 0; + + tracef("uv truncate work cb ok"); + return; + +err_after_list: + RaftHeapFree(segments); +err: + assert(rv != 0); + truncate->status = rv; +} + +static void uvTruncateAfterWorkCb(uv_work_t *work, int status) +{ + assert(work != NULL); + struct uvTruncate *truncate = work->data; + assert(truncate != NULL); + struct uv *uv = truncate->uv; + assert(uv != NULL); + tracef("uv truncate after work cb status:%d", status); + assert(status == 0); + if (truncate->status != 0) { + uv->errored = true; + } + tracef("clear truncate work"); + uv->truncate_work.data = NULL; + RaftHeapFree(truncate); + UvUnblock(uv); +} + +static void uvTruncateBarrierCb(struct UvBarrierReq *barrier) +{ + struct uvTruncate *truncate = barrier->data; + struct uv *uv = truncate->uv; + tracef("uv truncate barrier cb"); + int rv; + + /* Ensure that we don't invoke this callback more than once. */ + barrier->cb = NULL; + + /* If we're closing, don't perform truncation at all and abort here. */ + if (uv->closing) { + tracef("closing => don't truncate"); + RaftHeapFree(truncate); + uvMaybeFireCloseCb(uv); + return; + } + + assert(QUEUE_IS_EMPTY(&uv->append_writing_reqs)); + assert(QUEUE_IS_EMPTY(&uv->finalize_reqs)); + assert(uv->finalize_work.data == NULL); + assert(uv->truncate_work.data == NULL); + + tracef("set truncate work"); + uv->truncate_work.data = truncate; + rv = uv_queue_work(uv->loop, &uv->truncate_work, uvTruncateWorkCb, + uvTruncateAfterWorkCb); + if (rv != 0) { + tracef("truncate index %lld: %s", truncate->index, + uv_strerror(rv)); + tracef("clear truncate work"); + uv->truncate_work.data = NULL; + uv->errored = true; + } +} + +int UvTruncate(struct raft_io *io, raft_index index) +{ + struct uv *uv; + struct uvTruncate *truncate; + int rv; + + uv = io->impl; + tracef("uv truncate %llu", index); + assert(!uv->closing); + + /* We should truncate only entries that we were requested to append in + * the first place. */ + assert(index > 0); + assert(index < uv->append_next_index); + + truncate = RaftHeapMalloc(sizeof *truncate); + if (truncate == NULL) { + rv = RAFT_NOMEM; + goto err; + } + truncate->uv = uv; + truncate->index = index; + truncate->barrier.data = truncate; + truncate->barrier.blocking = true; + truncate->barrier.cb = uvTruncateBarrierCb; + + /* Make sure that we wait for any inflight writes to finish and then + * close the current segment. 
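+	 *
+	 * The barrier request is marked as blocking, so no new append
+	 * request will be started until the barrier callback has fired and
+	 * UvUnblock() has been called, which happens in
+	 * uvTruncateAfterWorkCb() once the truncation work submitted to the
+	 * threadpool has completed.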
+	 */
+	rv = UvBarrier(uv, index, &truncate->barrier);
+	if (rv != 0) {
+		goto err_after_req_alloc;
+	}
+
+	return 0;
+
+err_after_req_alloc:
+	RaftHeapFree(truncate);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+#undef tracef
diff --git a/src/raft/uv_work.c b/src/raft/uv_work.c
new file mode 100644
index 000000000..5b6431b97
--- /dev/null
+++ b/src/raft/uv_work.c
@@ -0,0 +1,78 @@
+#include "assert.h"
+#include "heap.h"
+#include "uv.h"
+
+struct uvAsyncWork
+{
+	struct uv *uv;
+	struct raft_io_async_work *req;
+	struct uv_work_s work;
+	int status;
+	queue queue;
+};
+
+static void uvAsyncWorkCb(uv_work_t *work)
+{
+	struct uvAsyncWork *w = work->data;
+	assert(w != NULL);
+	int rv;
+	rv = w->req->work(w->req);
+	w->status = rv;
+}
+
+static void uvAsyncAfterWorkCb(uv_work_t *work, int status)
+{
+	struct uvAsyncWork *w = work->data;
+	struct raft_io_async_work *req = w->req;
+	int req_status = w->status;
+	struct uv *uv = w->uv;
+	assert(status == 0);
+
+	QUEUE_REMOVE(&w->queue);
+	RaftHeapFree(w);
+	req->cb(req, req_status);
+	uvMaybeFireCloseCb(uv);
+}
+
+int UvAsyncWork(struct raft_io *io,
+		struct raft_io_async_work *req,
+		raft_io_async_work_cb cb)
+{
+	struct uv *uv;
+	struct uvAsyncWork *async_work;
+	int rv;
+
+	uv = io->impl;
+	assert(!uv->closing);
+
+	async_work = RaftHeapMalloc(sizeof *async_work);
+	if (async_work == NULL) {
+		rv = RAFT_NOMEM;
+		goto err;
+	}
+
+	async_work->uv = uv;
+	async_work->req = req;
+	async_work->work.data = async_work;
+	req->cb = cb;
+
+	QUEUE_PUSH(&uv->async_work_reqs, &async_work->queue);
+	rv = uv_queue_work(uv->loop, &async_work->work, uvAsyncWorkCb,
+			   uvAsyncAfterWorkCb);
+	if (rv != 0) {
+		QUEUE_REMOVE(&async_work->queue);
+		tracef("async work: %s", uv_strerror(rv));
+		rv = RAFT_IOERR;
+		goto err_after_req_alloc;
+	}
+
+	return 0;
+
+err_after_req_alloc:
+	RaftHeapFree(async_work);
+err:
+	assert(rv != 0);
+	return rv;
+}
+
+#undef tracef
diff --git a/src/raft/uv_writer.c b/src/raft/uv_writer.c
new file mode 100644
index 000000000..b489765b6
--- /dev/null
+++ b/src/raft/uv_writer.c
@@ -0,0 +1,544 @@
+#include "uv_writer.h"
+
+#include
+#include
+
+#include "../raft.h"
+#include "assert.h"
+#include "heap.h"
+
+/* Copy the error message from the request object to the writer object. */
+static void uvWriterReqTransferErrMsg(struct UvWriterReq *req)
+{
+	ErrMsgPrintf(req->writer->errmsg, "%s", req->errmsg);
+}
+
+/* Set the request status according to the given result code. */
+static void uvWriterReqSetStatus(struct UvWriterReq *req, int result)
+{
+	if (result < 0) {
+		ErrMsgPrintf(req->errmsg, "write failed: %d", result);
+		req->status = RAFT_IOERR;
+	} else if ((size_t)result < req->len) {
+		ErrMsgPrintf(req->errmsg,
+			     "short write: %d bytes instead of %zu", result,
+			     req->len);
+		req->status = RAFT_NOSPACE;
+	} else {
+		req->status = 0;
+	}
+}
+
+/* Remove the request from the queue of inflight writes and invoke the request
+ * callback if set. */
+static void uvWriterReqFinish(struct UvWriterReq *req)
+{
+	QUEUE_REMOVE(&req->queue);
+	if (req->status != 0) {
+		uvWriterReqTransferErrMsg(req);
+	}
+	req->cb(req, req->status);
+}
+
+/* Wrapper around the low-level OS syscall, providing a better error message.
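+ *
+ * In particular, io_setup(2) fails with EAGAIN when the requested number
+ * of events would exceed the system-wide limit on in-flight AIO requests;
+ * an operator can inspect and raise that limit through the fs.aio-max-nr
+ * sysctl, for example:
+ *
+ *   sysctl fs.aio-max-nr
+ *   sysctl -w fs.aio-max-nr=1048576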
*/ +static int uvWriterIoSetup(unsigned n, aio_context_t *ctx, char *errmsg) +{ + int rv; + rv = UvOsIoSetup(n, ctx); + if (rv != 0) { + switch (rv) { + case UV_EAGAIN: + ErrMsgPrintf(errmsg, + "AIO events user limit exceeded"); + rv = RAFT_TOOMANY; + break; + default: + UvOsErrMsg(errmsg, "io_setup", rv); + rv = RAFT_IOERR; + break; + } + return rv; + } + return 0; +} + +/* Run blocking syscalls involved in a file write request. + * + * Perform a KAIO write request and synchronously wait for it to complete. */ +static void uvWriterWorkCb(uv_work_t *work) +{ + struct UvWriterReq *req; /* Writer request object */ + struct UvWriter *w; /* Writer object */ + aio_context_t ctx; /* KAIO handle */ + struct iocb *iocbs; /* Pointer to KAIO request object */ + struct io_event event; /* KAIO response object */ + int n_events; + int rv; + + req = work->data; + w = req->writer; + + iocbs = &req->iocb; + + /* If more than one write in parallel is allowed, submit the AIO request + * using a dedicated context, to avoid synchronization issues between + * threads when multiple writes are submitted in parallel. This is + * suboptimal but in real-world users should use file systems and + * kernels with proper async write support. */ + if (w->n_events > 1) { + ctx = 0; + rv = uvWriterIoSetup(1 /* Maximum concurrent requests */, &ctx, + req->errmsg); + if (rv != 0) { + goto out; + } + } else { + ctx = w->ctx; + } + + /* Submit the request */ + rv = UvOsIoSubmit(ctx, 1, &iocbs); + if (rv != 0) { + /* UNTESTED: since we're not using NOWAIT and the parameters are + * valid, this shouldn't fail. */ + UvOsErrMsg(req->errmsg, "io_submit", rv); + rv = RAFT_IOERR; + goto out_after_io_setup; + } + + /* Wait for the request to complete */ + n_events = UvOsIoGetevents(ctx, 1, 1, &event, NULL); + assert(n_events == 1); + if (n_events != 1) { + /* UNTESTED */ + rv = n_events >= 0 ? -1 : n_events; + } + +out_after_io_setup: + if (w->n_events > 1) { + UvOsIoDestroy(ctx); + } + +out: + if (rv != 0) { + req->status = rv; + } else { + uvWriterReqSetStatus(req, (int)event.res); + } + + return; +} + +/* Callback run after writeWorkCb has returned. It normally invokes the write + * request callback. */ +static void uvWriterAfterWorkCb(uv_work_t *work, int status) +{ + struct UvWriterReq *req = work->data; /* Write file request object */ + assert(status == 0); /* We don't cancel worker requests */ + uvWriterReqFinish(req); +} + +/* Callback fired when the event fd associated with AIO write requests should be + * ready for reading (i.e. when a write has completed). */ +static void uvWriterPollCb(uv_poll_t *poller, int status, int events) +{ + struct UvWriter *w = poller->data; + uint64_t completed; /* True if the write is complete */ + unsigned i; + int n_events; + int rv; + + assert(w->event_fd >= 0); + assert(status == 0); + if (status != 0) { + /* UNTESTED libuv docs: If an error happens while polling, + * status will be < 0 and corresponds with one of the UV_E* + * error codes. */ + goto fail_requests; + } + + assert(events & UV_READABLE); + + /* Read the event file descriptor */ + rv = (int)read(w->event_fd, &completed, sizeof completed); + if (rv != sizeof completed) { + /* UNTESTED: According to eventfd(2) this is the only possible + * failure mode, meaning that epoll has indicated that the event + * FD is not yet ready. */ + assert(errno == EAGAIN); + return; + } + + /* TODO: this assertion fails in unit tests */ + /* assert(completed == 1); */ + + /* Try to fetch the write responses. 
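+	 *
+	 * (The value read from the eventfd above is a counter of
+	 * completions signalled since the last read, so more than one write
+	 * may be ready at this point; this is also why the completed == 1
+	 * assertion above is commented out.)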
+ * + * If we got here at least one write should have completed and io_events + * should return immediately without blocking. */ + n_events = + UvOsIoGetevents(w->ctx, 1, (long int)w->n_events, w->events, NULL); + assert(n_events >= 1); + if (n_events < 1) { + /* UNTESTED */ + status = n_events == 0 ? -1 : n_events; + goto fail_requests; + } + + for (i = 0; i < (unsigned)n_events; i++) { + struct io_event *event = &w->events[i]; + struct UvWriterReq *req = *((void **)&event->data); + + /* If we got EAGAIN, it means it was not possible to perform the + * write asynchronously, so let's fall back to the threadpool. + */ + if (event->res == -EAGAIN) { + req->iocb.aio_flags &= (unsigned)~IOCB_FLAG_RESFD; + req->iocb.aio_resfd = 0; + req->iocb.aio_rw_flags &= ~RWF_NOWAIT; + assert(req->work.data == NULL); + req->work.data = req; + rv = uv_queue_work(w->loop, &req->work, uvWriterWorkCb, + uvWriterAfterWorkCb); + if (rv != 0) { + /* UNTESTED: with the current libuv + * implementation this should never fail. */ + UvOsErrMsg(req->errmsg, "uv_queue_work", rv); + req->status = RAFT_IOERR; + goto finish; + } + return; + } + + uvWriterReqSetStatus(req, (int)event->res); + + finish: + uvWriterReqFinish(req); + } + + return; + +fail_requests: + while (!QUEUE_IS_EMPTY(&w->poll_queue)) { + queue *head; + struct UvWriterReq *req; + head = QUEUE_HEAD(&w->poll_queue); + req = QUEUE_DATA(head, struct UvWriterReq, queue); + uvWriterReqSetStatus(req, status); + uvWriterReqFinish(req); + } +} + +int UvWriterInit(struct UvWriter *w, + struct uv_loop_s *loop, + uv_file fd, + bool direct /* Whether to use direct I/O */, + bool async /* Whether async I/O is available */, + unsigned max_concurrent_writes, + char *errmsg) +{ + void *data = w->data; + int rv = 0; + memset(w, 0, sizeof *w); + w->data = data; + w->loop = loop; + w->fd = fd; + w->async = async; + w->ctx = 0; + w->events = NULL; + w->n_events = max_concurrent_writes; + w->event_fd = -1; + w->event_poller.data = NULL; + w->check.data = NULL; + w->close_cb = NULL; + QUEUE_INIT(&w->poll_queue); + QUEUE_INIT(&w->work_queue); + w->closing = false; + w->errmsg = errmsg; + + /* Set direct I/O if available. */ + if (direct) { + rv = UvOsSetDirectIo(w->fd); + if (rv != 0) { + UvOsErrMsg(errmsg, "fcntl", rv); + goto err; + } + } + + /* Setup the AIO context. */ + rv = uvWriterIoSetup(w->n_events, &w->ctx, errmsg); + if (rv != 0) { + goto err; + } + + /* Initialize the array of re-usable event objects. */ + w->events = RaftHeapCalloc(w->n_events, sizeof *w->events); + if (w->events == NULL) { + /* UNTESTED: todo */ + ErrMsgOom(errmsg); + rv = RAFT_NOMEM; + goto err_after_io_setup; + } + + /* Create an event file descriptor to get notified when a write has + * completed. */ + rv = UvOsEventfd(0, UV_FS_O_NONBLOCK); + if (rv < 0) { + /* UNTESTED: should fail only with ENOMEM */ + UvOsErrMsg(errmsg, "eventfd", rv); + rv = RAFT_IOERR; + goto err_after_events_alloc; + } + w->event_fd = rv; + + rv = uv_poll_init(loop, &w->event_poller, w->event_fd); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this should + * never fail. */ + UvOsErrMsg(errmsg, "uv_poll_init", rv); + rv = RAFT_IOERR; + goto err_after_event_fd; + } + w->event_poller.data = w; + + rv = uv_check_init(loop, &w->check); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this should + * never fail. 
*/ + UvOsErrMsg(errmsg, "uv_check_init", rv); + rv = RAFT_IOERR; + goto err_after_event_fd; + } + w->check.data = w; + + rv = uv_poll_start(&w->event_poller, UV_READABLE, uvWriterPollCb); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this should + * never fail. */ + UvOsErrMsg(errmsg, "uv_poll_start", rv); + rv = RAFT_IOERR; + goto err_after_event_fd; + } + + return 0; + +err_after_event_fd: + UvOsClose(w->event_fd); +err_after_events_alloc: + RaftHeapFree(w->events); +err_after_io_setup: + UvOsIoDestroy(w->ctx); +err: + assert(rv != 0); + return rv; +} + +static void uvWriterCleanUpAndFireCloseCb(struct UvWriter *w) +{ + assert(w->closing); + + UvOsClose(w->fd); + RaftHeapFree(w->events); + UvOsIoDestroy(w->ctx); + + if (w->close_cb != NULL) { + w->close_cb(w); + } +} + +static void uvWriterPollerCloseCb(struct uv_handle_s *handle) +{ + struct UvWriter *w = handle->data; + w->event_poller.data = NULL; + + /* Cancel all pending requests. */ + while (!QUEUE_IS_EMPTY(&w->poll_queue)) { + queue *head; + struct UvWriterReq *req; + head = QUEUE_HEAD(&w->poll_queue); + req = QUEUE_DATA(head, struct UvWriterReq, queue); + assert(req->work.data == NULL); + req->status = RAFT_CANCELED; + uvWriterReqFinish(req); + } + + if (w->check.data != NULL) { + return; + } + + uvWriterCleanUpAndFireCloseCb(w); +} + +static void uvWriterCheckCloseCb(struct uv_handle_s *handle) +{ + struct UvWriter *w = handle->data; + w->check.data = NULL; + if (w->event_poller.data != NULL) { + return; + } + uvWriterCleanUpAndFireCloseCb(w); +} + +static void uvWriterCheckCb(struct uv_check_s *check) +{ + struct UvWriter *w = check->data; + if (!QUEUE_IS_EMPTY(&w->work_queue)) { + return; + } + uv_close((struct uv_handle_s *)&w->check, uvWriterCheckCloseCb); +} + +void UvWriterClose(struct UvWriter *w, UvWriterCloseCb cb) +{ + int rv; + assert(!w->closing); + w->closing = true; + w->close_cb = cb; + + /* We can close the event file descriptor right away, but we shouldn't + * close the main file descriptor or destroy the AIO context since there + * might be threadpool requests in flight. */ + UvOsClose(w->event_fd); + + rv = uv_poll_stop(&w->event_poller); + assert(rv == 0); /* Can this ever fail? */ + + uv_close((struct uv_handle_s *)&w->event_poller, uvWriterPollerCloseCb); + + /* If we have requests executing in the threadpool, we need to wait for + * them. That's done in the check callback. */ + if (!QUEUE_IS_EMPTY(&w->work_queue)) { + uv_check_start(&w->check, uvWriterCheckCb); + } else { + uv_close((struct uv_handle_s *)&w->check, uvWriterCheckCloseCb); + } +} + +/* Return the total lengths of the given buffers. */ +static size_t lenOfBufs(const uv_buf_t bufs[], unsigned n) +{ + size_t len = 0; + unsigned i; + for (i = 0; i < n; i++) { + len += bufs[i].len; + } + return len; +} + +int UvWriterSubmit(struct UvWriter *w, + struct UvWriterReq *req, + const uv_buf_t bufs[], + unsigned n, + size_t offset, + UvWriterReqCb cb) +{ + int rv = 0; + struct iocb *iocbs = &req->iocb; + assert(!w->closing); + + /* TODO: at the moment we are not leveraging the support for concurrent + * writes, so ensure that we're getting write requests + * sequentially. 
*/ + if (w->n_events == 1) { + assert(QUEUE_IS_EMPTY(&w->poll_queue)); + assert(QUEUE_IS_EMPTY(&w->work_queue)); + } + + assert(w->fd >= 0); + assert(w->event_fd >= 0); + assert(w->ctx != 0); + assert(req != NULL); + assert(bufs != NULL); + assert(n > 0); + + req->writer = w; + req->len = lenOfBufs(bufs, n); + req->status = -1; + req->work.data = NULL; + req->cb = cb; + memset(&req->iocb, 0, sizeof req->iocb); + memset(req->errmsg, 0, sizeof req->errmsg); + + req->iocb.aio_fildes = (uint32_t)w->fd; + req->iocb.aio_lio_opcode = IOCB_CMD_PWRITEV; + req->iocb.aio_reqprio = 0; + *((void **)(&req->iocb.aio_buf)) = (void *)bufs; + req->iocb.aio_nbytes = n; + req->iocb.aio_offset = (int64_t)offset; + *((void **)(&req->iocb.aio_data)) = (void *)req; + +#if defined(RWF_HIPRI) + /* High priority request, if possible */ + /* TODO: do proper kernel feature detection for this one. */ + /* req->iocb.aio_rw_flags |= RWF_HIPRI; */ +#endif + +#if defined(RWF_DSYNC) + /* Use per-request synchronous I/O if available. Otherwise, we have + * opened the file with O_DSYNC. */ + /* TODO: do proper kernel feature detection for this one. */ + /* req->iocb.aio_rw_flags |= RWF_DSYNC; */ +#endif + + /* If io_submit can be run in a 100% non-blocking way, we'll try to + * write without using the threadpool. */ + if (w->async) { + req->iocb.aio_flags |= IOCB_FLAG_RESFD; + req->iocb.aio_resfd = (uint32_t)w->event_fd; + req->iocb.aio_rw_flags |= RWF_NOWAIT; + } + + /* Try to submit the write request asynchronously */ + if (w->async) { + QUEUE_PUSH(&w->poll_queue, &req->queue); + rv = UvOsIoSubmit(w->ctx, 1, &iocbs); + + /* If no error occurred, we're done, the write request was + * submitted. */ + if (rv == 0) { + goto done; + } + + QUEUE_REMOVE(&req->queue); + + /* Check the reason of the error. */ + switch (rv) { + case UV_EAGAIN: + break; + default: + /* Unexpected error */ + UvOsErrMsg(w->errmsg, "io_submit", rv); + rv = RAFT_IOERR; + goto err; + } + + /* Submitting the write would block, or NOWAIT is not + * supported. Let's run this request in the threadpool. */ + req->iocb.aio_flags &= (unsigned)~IOCB_FLAG_RESFD; + req->iocb.aio_resfd = 0; + req->iocb.aio_rw_flags &= ~RWF_NOWAIT; + } + + /* If we got here it means we need to run io_submit in the threadpool. + */ + QUEUE_PUSH(&w->work_queue, &req->queue); + req->work.data = req; + rv = uv_queue_work(w->loop, &req->work, uvWriterWorkCb, + uvWriterAfterWorkCb); + if (rv != 0) { + /* UNTESTED: with the current libuv implementation this can't + * fail. */ + req->work.data = NULL; + QUEUE_REMOVE(&req->queue); + UvOsErrMsg(w->errmsg, "uv_queue_work", rv); + rv = RAFT_IOERR; + goto err; + } + +done: + return 0; + +err: + assert(rv != 0); + return rv; +} diff --git a/src/raft/uv_writer.h b/src/raft/uv_writer.h new file mode 100644 index 000000000..db8f5c293 --- /dev/null +++ b/src/raft/uv_writer.h @@ -0,0 +1,78 @@ +/* Asynchronous API to write a file. */ + +#ifndef UV_WRITER_H_ +#define UV_WRITER_H_ + +#include + +#include "err.h" +#include "queue.h" +#include "uv_os.h" + +/* Perform asynchronous writes to a single file. */ +struct UvWriter; + +/* Callback called after the memory associated with a file handle can be + * released. 
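+ *
+ * A typical lifecycle looks like the following sketch (error handling
+ * omitted; fd is an already-opened file descriptor, and writeCb/closeCb
+ * are hypothetical user callbacks):
+ *
+ *   struct UvWriter writer;
+ *   struct UvWriterReq req;
+ *   char errmsg[RAFT_ERRMSG_BUF_SIZE];
+ *   UvWriterInit(&writer, loop, fd, false, true, 4, errmsg);
+ *   UvWriterSubmit(&writer, &req, bufs, n_bufs, 0, writeCb);
+ *   ...
+ *   UvWriterClose(&writer, closeCb);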
*/ +typedef void (*UvWriterCloseCb)(struct UvWriter *w); + +struct UvWriter +{ + void *data; /* User data */ + struct uv_loop_s *loop; /* Event loop */ + uv_file fd; /* File handle */ + bool async; /* Whether fully async I/O is supported */ + aio_context_t ctx; /* KAIO handle */ + struct io_event *events; /* Array of KAIO response objects */ + unsigned n_events; /* Length of the events array */ + int event_fd; /* Poll'ed to check if write is finished */ + struct uv_poll_s + event_poller; /* Poll event_fd for completed poll requests */ + struct uv_check_s check; /* Check for completed threadpool requests */ + UvWriterCloseCb close_cb; /* Close callback */ + queue poll_queue; /* Pollable write requests */ + queue work_queue; /* Threadpool write requests */ + bool closing; /* Whether we're closing or closed */ + char *errmsg; /* Description of last error */ +}; + +/* Initialize a file writer. */ +int UvWriterInit(struct UvWriter *w, + struct uv_loop_s *loop, + uv_file fd, + bool direct /* Whether to use direct I/O */, + bool async /* Whether async I/O is available */, + unsigned max_concurrent_writes, + char *errmsg); + +/* Close the given file and release all associated resources. */ +void UvWriterClose(struct UvWriter *w, UvWriterCloseCb cb); + +/* Write request. */ +struct UvWriterReq; + +/* Callback called after a write request has been completed. */ +typedef void (*UvWriterReqCb)(struct UvWriterReq *req, int status); + +struct UvWriterReq +{ + void *data; /* User data */ + struct UvWriter *writer; /* Originating writer */ + size_t len; /* Total number of bytes to write */ + int status; /* Request result code */ + struct uv_work_s work; /* To execute logic in the threadpool */ + UvWriterReqCb cb; /* Callback to invoke upon request completion */ + struct iocb iocb; /* KAIO request (for writing) */ + char errmsg[256]; /* Error description (for thread-safety) */ + queue queue; /* Prev/next links in the inflight queue */ +}; + +/* Asynchronously write data to the underlying file. */ +int UvWriterSubmit(struct UvWriter *w, + struct UvWriterReq *req, + const uv_buf_t bufs[], + unsigned n, + size_t offset, + UvWriterReqCb cb); + +#endif /* UV_WRITER_H_ */ diff --git a/src/roles.c b/src/roles.c index 8993b5e19..21fb6bc5f 100644 --- a/src/roles.c +++ b/src/roles.c @@ -1,9 +1,8 @@ #include -#include - #include "client/protocol.h" #include "lib/queue.h" +#include "raft.h" #include "roles.h" #include "server.h" #include "translate.h" diff --git a/src/server.h b/src/server.h index c5509e3e6..f6678323a 100644 --- a/src/server.h +++ b/src/server.h @@ -1,8 +1,6 @@ #ifndef DQLITE_SERVER_H #define DQLITE_SERVER_H -#include -#include #include #include @@ -12,6 +10,7 @@ #include "id.h" #include "lib/assert.h" #include "logger.h" +#include "raft.h" #include "registry.h" #define DQLITE_ERRMSG_BUF_SIZE 300 diff --git a/src/translate.c b/src/translate.c index 673cc3c20..32938b414 100644 --- a/src/translate.c +++ b/src/translate.c @@ -1,10 +1,9 @@ #include "translate.h" -#include - #include "assert.h" #include "leader.h" #include "protocol.h" +#include "raft.h" /* Translate a raft error to a dqlite one. 
*/ int translateRaftErrCode(int code) diff --git a/src/transport.c b/src/transport.c index 0607ec8e9..98750a74f 100644 --- a/src/transport.c +++ b/src/transport.c @@ -1,6 +1,5 @@ #include "lib/transport.h" -#include #include #include #include @@ -9,6 +8,7 @@ #include "lib/addr.h" #include "message.h" #include "protocol.h" +#include "raft.h" #include "request.h" #include "tracing.h" #include "transport.h" diff --git a/src/transport.h b/src/transport.h index 37d3629ac..5d1d3800f 100644 --- a/src/transport.h +++ b/src/transport.h @@ -9,7 +9,7 @@ #ifndef TRANSPORT_H_ #define TRANSPORT_H_ -#include +#include "raft.h" #include "../include/dqlite.h" diff --git a/src/vfs.c b/src/vfs.c index 418a60a5f..de247ca07 100644 --- a/src/vfs.c +++ b/src/vfs.c @@ -9,8 +9,6 @@ #include #include -#include - #include #include "../include/dqlite.h" @@ -19,6 +17,7 @@ #include "lib/byte.h" #include "format.h" +#include "raft.h" #include "tracing.h" #include "vfs.h" diff --git a/test/integration/test_vfs.c b/test/integration/test_vfs.c index ea4c4d207..3335c228c 100644 --- a/test/integration/test_vfs.c +++ b/test/integration/test_vfs.c @@ -1,4 +1,3 @@ -#include #include #include "../lib/fs.h" @@ -7,6 +6,7 @@ #include "../lib/sqlite.h" #include "../../include/dqlite.h" +#include "../../src/raft.h" #include diff --git a/test/lib/cluster.h b/test/lib/cluster.h index 760bd8d40..fdc9f1988 100644 --- a/test/lib/cluster.h +++ b/test/lib/cluster.h @@ -15,11 +15,9 @@ #ifndef TEST_CLUSTER_H #define TEST_CLUSTER_H -#include -#include - #include "../../src/config.h" #include "../../src/fsm.h" +#include "../../src/raft.h" #include "../../src/registry.h" #include "../../src/vfs.h" diff --git a/test/lib/raft.h b/test/lib/raft.h index fed669b5d..a36cfc461 100644 --- a/test/lib/raft.h +++ b/test/lib/raft.h @@ -5,11 +5,10 @@ #ifndef TEST_RAFT_H #define TEST_RAFT_H -#include -#include #include #include "../../src/fsm.h" +#include "../../src/raft.h" #include "../../src/transport.h" #include "fs.h" #include "logger.h" diff --git a/test/lib/raft_heap.c b/test/lib/raft_heap.c index 524529fe8..04b4dc92a 100644 --- a/test/lib/raft_heap.c +++ b/test/lib/raft_heap.c @@ -1,4 +1,4 @@ -#include +#include "../../src/raft.h" #include "fault.h" #include "raft_heap.h" diff --git a/test/raft/fuzzy/main_core.c b/test/raft/fuzzy/main_core.c new file mode 100644 index 000000000..807f4a72d --- /dev/null +++ b/test/raft/fuzzy/main_core.c @@ -0,0 +1,11 @@ +#include "../lib/runner.h" + +MunitSuite _main_suites[64]; +int _main_suites_n = 0; + +/* Test runner executable */ +int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)]) +{ + MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0}; + return munit_suite_main(&suite, (void *)"unit", argc, argv); +} diff --git a/test/raft/fuzzy/test_election.c b/test/raft/fuzzy/test_election.c new file mode 100644 index 000000000..de6b0340b --- /dev/null +++ b/test/raft/fuzzy/test_election.c @@ -0,0 +1,103 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static char *cluster_n[] = {"3", "4", "5", "7", NULL}; +static char *cluster_pre_vote[] = {"0", "1", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_N_PARAM, cluster_n}, + {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, + {NULL, NULL}, +}; + +static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) 
+{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(0); + CLUSTER_BOOTSTRAP; + CLUSTER_RANDOMIZE; + CLUSTER_START; + return f; +} + +static void tear_down(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Tests + * + *****************************************************************************/ + +SUITE(election) + +/* A leader is eventually elected */ +TEST(election, win, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + return MUNIT_OK; +} + +/* A new leader is elected if the current one dies. */ +TEST(election, change, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + CLUSTER_KILL_LEADER; + CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); + CLUSTER_STEP_UNTIL_HAS_LEADER(20000); + return MUNIT_OK; +} + +/* A new leader is elected if the current one dies and a previously killed + * server with an outdated log and outdated term is revived. */ +TEST(election, changeReviveOutdated, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + unsigned i; + + /* Kill a random server */ + i = ((unsigned)rand()) % CLUSTER_N; + CLUSTER_KILL(i); + + /* Server i's term will be lower than the term of the election. */ + CLUSTER_STEP_UNTIL_HAS_LEADER(20000); + + /* Add some entries to the log */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_KILL_LEADER; + CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); + + /* Revive server i with an outdated log and term, the cluster + * should be able to elect a new leader */ + CLUSTER_REVIVE(i); + CLUSTER_STEP_UNTIL_HAS_LEADER(20000); + return MUNIT_OK; +} + +/* If no majority of servers is online, no leader is elected. */ +TEST(election, noQuorum, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + CLUSTER_KILL_MAJORITY; + CLUSTER_STEP_UNTIL_ELAPSED(30000); + munit_assert_false(CLUSTER_HAS_LEADER); + return MUNIT_OK; +} diff --git a/test/raft/fuzzy/test_liveness.c b/test/raft/fuzzy/test_liveness.c new file mode 100644 index 000000000..98bfe0fd8 --- /dev/null +++ b/test/raft/fuzzy/test_liveness.c @@ -0,0 +1,154 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +/* Maximum number of cluster loop iterations each test should perform. */ +#define MAX_ITERATIONS 25000 + +/* Maximum number of cluster loop iterations a pair of servers should stay + * disconnected. */ +#define MAX_DISCONNECT 150 + +struct disconnection +{ + unsigned id1; + unsigned id2; + int start; + int duration; +}; + +struct fixture +{ + FIXTURE_CLUSTER; + struct disconnection *disconnections; +}; + +static char *cluster_n[] = {"3", "4", NULL}; +static char *cluster_pre_vote[] = {"0", "1", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_N_PARAM, cluster_n}, + {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, + {NULL, NULL}, +}; + +/* Return the number of distinct server pairs in the cluster. */ +static int __server_pairs(struct fixture *f) +{ + return CLUSTER_N * (CLUSTER_N - 1) / 2; +} + +/* Update the cluster connectivity for the given iteration. 
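+ *
+ * Each pair of servers gets disconnected with a probability of 1 in 10 at
+ * every iteration, for a random duration of between 50 and MAX_DISCONNECT
+ * iterations, after which the pair is reconnected.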
*/ +static void __update_connectivity(struct fixture *f, int i) +{ + int p; + int pairs = __server_pairs(f); + + for (p = 0; p < pairs; p++) { + struct disconnection *disconnection = &f->disconnections[p]; + unsigned id1 = disconnection->id1; + unsigned id2 = disconnection->id2; + + if (disconnection->start == 0) { + /* Decide whether to disconnect this pair. */ + if (munit_rand_int_range(1, 10) <= 1) { + disconnection->start = i; + disconnection->duration = + munit_rand_int_range(50, MAX_DISCONNECT); + raft_fixture_saturate(&f->cluster, id1 - 1, id2 - 1); + raft_fixture_saturate(&f->cluster, id2 - 1, id1 - 1); + } + } else { + /* Decide whether to reconnect this pair. */ + if (i - disconnection->start > disconnection->duration) { + raft_fixture_desaturate(&f->cluster, id1 - 1, id2 - 1); + raft_fixture_desaturate(&f->cluster, id2 - 1, id1 - 1); + disconnection->start = 0; + } + } + } +} + +static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + int pairs; + size_t i, j, k; + SETUP_CLUSTER(0); + CLUSTER_BOOTSTRAP; + CLUSTER_RANDOMIZE; + CLUSTER_START; + + /* Number of distinct pairs of servers. */ + pairs = __server_pairs(f); + + f->disconnections = munit_malloc(pairs * sizeof *f->disconnections); + + k = 0; + for (i = 0; i < CLUSTER_N; i++) { + for (j = i + 1; j < CLUSTER_N; j++) { + struct disconnection *disconnection = &f->disconnections[k]; + disconnection->id1 = i + 1; + disconnection->id2 = j + 1; + disconnection->start = 0; + disconnection->duration = 0; + k++; + } + } + + return f; +} + +static void tear_down(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f->disconnections); + free(f); +} + +/****************************************************************************** + * + * Tests + * + *****************************************************************************/ + +SUITE(liveness) + +static void apply_cb(struct raft_apply *req, int status, void *result) +{ + (void)status; + (void)result; + free(req); +} + +/* The system makes progress even in case of network disruptions. 
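+ *
+ * At every iteration the connectivity between pairs of servers is
+ * perturbed as described above; whenever a leader is available one more
+ * entry is submitted, and the loop stops as soon as the leader has
+ * applied at least one of them (i.e. its last applied index reaches 2).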
*/ +TEST(liveness, networkDisconnect, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + int i = 0; + + (void)params; + + for (i = 0; i < MAX_ITERATIONS; i++) { + __update_connectivity(f, i); + raft_fixture_step(&f->cluster); + + if (CLUSTER_LEADER != CLUSTER_N) { + struct raft_apply *req = munit_malloc(sizeof *req); + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, apply_cb); + if (CLUSTER_LAST_APPLIED(CLUSTER_LEADER) >= 2) { + break; + } + } + } + + // munit_assert_int(CLUSTER_LAST_APPLIED(CLUSTER_LEADER), >=, 2); + + return MUNIT_OK; +} diff --git a/test/raft/fuzzy/test_membership.c b/test/raft/fuzzy/test_membership.c new file mode 100644 index 000000000..00b3e9205 --- /dev/null +++ b/test/raft/fuzzy/test_membership.c @@ -0,0 +1,113 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; + struct raft_change req; +}; + +static char *cluster_n[] = {"3", "4", "5", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_N_PARAM, cluster_n}, + {NULL, NULL}, +}; + +static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(0); + CLUSTER_BOOTSTRAP; + CLUSTER_RANDOMIZE; + CLUSTER_START; + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + return f; +} + +static void tear_down(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Tests + * + *****************************************************************************/ + +SUITE(membership) + +TEST(membership, addNonVoting, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + const struct raft_server *server; + struct raft *raft; + + CLUSTER_ADD(&f->req); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); + + /* Then promote it. */ + CLUSTER_ASSIGN(&f->req, RAFT_STANDBY); + + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 4, 2000); + + raft = CLUSTER_RAFT(CLUSTER_LEADER); + + server = &raft->configuration.servers[CLUSTER_N - 1]; + munit_assert_int(server->id, ==, CLUSTER_N); + + return MUNIT_OK; +} + +TEST(membership, addVoting, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + const struct raft_server *server; + struct raft *raft; + + (void)params; + + CLUSTER_ADD(&f->req); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); + + /* Then promote it. 
*/ + CLUSTER_ASSIGN(&f->req, RAFT_VOTER); + + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 4, 2000); + + raft = CLUSTER_RAFT(CLUSTER_LEADER); + + server = &raft->configuration.servers[CLUSTER_N - 1]; + munit_assert_int(server->role, ==, RAFT_VOTER); + + return MUNIT_OK; +} + +TEST(membership, removeVoting, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft *raft; + int rv; + + (void)params; + + raft = CLUSTER_RAFT(CLUSTER_LEADER); + + rv = raft_remove(raft, &f->req, CLUSTER_LEADER % CLUSTER_N + 1, NULL); + munit_assert_int(rv, ==, 0); + + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 3, 2000); + + munit_assert_int(raft->configuration.n, ==, CLUSTER_N - 1); + + return 0; +} diff --git a/test/raft/fuzzy/test_replication.c b/test/raft/fuzzy/test_replication.c new file mode 100644 index 000000000..22821e00c --- /dev/null +++ b/test/raft/fuzzy/test_replication.c @@ -0,0 +1,175 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static char *cluster_n[] = {"3", "5", "7", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_N_PARAM, cluster_n}, + {NULL, NULL}, +}; + +static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(0); + CLUSTER_BOOTSTRAP; + CLUSTER_RANDOMIZE; + CLUSTER_START; + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + return f; +} + +static void tear_down(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +#define APPLY_ADD_ONE(REQ) CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, REQ, 1, NULL) + +/****************************************************************************** + * + * Tests + * + *****************************************************************************/ + +SUITE(replication) + +/* New entries on the leader are eventually replicated to followers. */ +TEST(replication, appendEntries, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft_apply *req = munit_malloc(sizeof *req); + (void)params; + APPLY_ADD_ONE(req); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 2000); + free(req); + return MUNIT_OK; +} + +/* The cluster remains available even if the current leader dies and a new + * leader gets elected. */ +TEST(replication, availability, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft_apply *req1 = munit_malloc(sizeof *req1); + struct raft_apply *req2 = munit_malloc(sizeof *req2); + + (void)params; + + APPLY_ADD_ONE(req1); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 2000); + + CLUSTER_KILL_LEADER; + CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); + CLUSTER_STEP_UNTIL_HAS_LEADER(10000); + + APPLY_ADD_ONE(req2); + /* Index 3 -> 5 = APPLY entry + BARRIER entry after becoming leader */ + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 5, 2000); + + free(req1); + free(req2); + + return MUNIT_OK; +} + +static void apply_cb(struct raft_apply *req, int status, void *result) +{ + (void)status; + (void)result; + free(req); +} + +/* If no quorum is available, entries don't get committed. 
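+ *
+ * The entry below is submitted to the leader right before a majority of
+ * servers gets killed, so it can't be replicated to a quorum and must
+ * remain uncommitted: every server's last applied index stays at its
+ * initial value.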
*/ +TEST(replication, noQuorum, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft_apply *req = munit_malloc(sizeof *req); + unsigned i; + + (void)params; + + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, apply_cb); + CLUSTER_KILL_MAJORITY; + + CLUSTER_STEP_UNTIL_ELAPSED(10000); + + for (i = 0; i < CLUSTER_N; i++) { + munit_assert_int(CLUSTER_LAST_APPLIED(i), ==, 1); + } + + return MUNIT_OK; +} + +/* If the cluster is partitioned, entries don't get committed. */ +TEST(replication, partitioned, setup, tear_down, 0, _params) +{ + struct fixture *f = data; + struct raft_apply *req1 = munit_malloc(sizeof *req1); + struct raft_apply *req2 = munit_malloc(sizeof *req2); + unsigned leader_id; + size_t i; + size_t n; + + (void)params; + + leader_id = CLUSTER_LEADER + 1; + + /* Disconnect the leader from a majority of servers */ + n = 0; + for (i = 0; n < (CLUSTER_N / 2) + 1; i++) { + struct raft *raft = CLUSTER_RAFT(i); + if (raft->id == leader_id) { + continue; + } + raft_fixture_saturate(&f->cluster, leader_id - 1, raft->id - 1); + raft_fixture_saturate(&f->cluster, raft->id - 1, leader_id - 1); + n++; + } + + /* Try to append a new entry using the disconnected leader. */ + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req1, 1, apply_cb); + + /* The leader gets deposed. */ + CLUSTER_STEP_UNTIL_HAS_NO_LEADER(10000); + + /* The entry does not get committed. */ + CLUSTER_STEP_UNTIL_ELAPSED(5000); + + /* Reconnect the old leader */ + for (i = 0; i < CLUSTER_N; i++) { + struct raft *raft = CLUSTER_RAFT(i); + if (raft->id == leader_id) { + continue; + } + raft_fixture_desaturate(&f->cluster, leader_id - 1, raft->id - 1); + } + + // TODO this fails with seed 0x3914306f + CLUSTER_STEP_UNTIL_HAS_LEADER(30000); + + /* Re-try now to append the entry. */ + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req2, 1, apply_cb); + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, 2, 10000); + + return MUNIT_OK; +} diff --git a/test/raft/integration/append_helpers.h b/test/raft/integration/append_helpers.h new file mode 100644 index 000000000..59c1bbf38 --- /dev/null +++ b/test/raft/integration/append_helpers.h @@ -0,0 +1,102 @@ +#include "../../../src/raft/uv.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; + void *data; +}; + +static void appendCbAssertResult(struct raft_io_append *req, int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +/* Declare and fill the entries array for the append request identified by + * I. The array will have N entries, and each entry will have a data buffer of + * SIZE bytes.*/ +#define ENTRIES(I, N, SIZE) \ + struct raft_entry _entries##I[N]; \ + uint8_t _entries_data##I[N * SIZE]; \ + { \ + int _i; \ + for (_i = 0; _i < N; _i++) { \ + struct raft_entry *entry = &_entries##I[_i]; \ + entry->term = 1; \ + entry->type = RAFT_COMMAND; \ + entry->buf.base = &_entries_data##I[_i * SIZE]; \ + entry->buf.len = SIZE; \ + entry->batch = NULL; \ + munit_assert_ptr_not_null(entry->buf.base); \ + memset(entry->buf.base, 0, entry->buf.len); \ + uint64_t _temporary = f->count; \ + memcpy(entry->buf.base, &_temporary, 8); \ + f->count++; \ + } \ + } + +/* Submit an append request identified by I, with N_ENTRIES entries, each one of + * size ENTRY_SIZE. 
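+ * (The ENTRIES() helper above stamps the beginning of each entry's buffer
+ * with a 64-bit sequence number taken from f->count, so every appended
+ * entry carries distinct, recognizable data.)
+ *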
When the append request completes, CB will be called + * and DATA will be available in result->data. f->io.append is expected to + * return RV. */ +#define APPEND_SUBMIT_CB_DATA(I, N_ENTRIES, ENTRY_SIZE, CB, DATA, RV) \ + struct raft_io_append _req##I; \ + struct result _result##I = {0, false, DATA}; \ + int _rv##I; \ + ENTRIES(I, N_ENTRIES, ENTRY_SIZE); \ + _req##I.data = &_result##I; \ + _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, CB); \ + munit_assert_int(_rv##I, ==, RV) + +/* Submit an append request identified by I, with N_ENTRIES entries, each one of + * size ENTRY_SIZE. The default expectation is for the operation to succeed. A + * custom STATUS can be set with APPEND_EXPECT. */ +#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE) \ + APPEND_SUBMIT_CB_DATA(I, N_ENTRIES, ENTRY_SIZE, appendCbAssertResult, \ + NULL, 0) + +/* Try to submit an append request and assert that the given error code and + * message are returned. */ +#define APPEND_ERROR(N_ENTRIES, ENTRY_SIZE, RV, ERRMSG) \ + do { \ + struct raft_io_append _req; \ + int _rv; \ + ENTRIES(0, N_ENTRIES, ENTRY_SIZE); \ + _rv = f->io.append(&f->io, &_req, _entries0, N_ENTRIES, NULL); \ + munit_assert_int(_rv, ==, RV); \ + /* munit_assert_string_equal(f->io.errmsg, ERRMSG);*/ \ + } while (0) + +#define APPEND_EXPECT(I, STATUS) _result##I.status = STATUS + +/* Wait for the append request identified by I to complete. */ +#define APPEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done) + +/* Submit an append request with an entries array with N_ENTRIES entries, each + * one of size ENTRY_SIZE, and wait for the operation to successfully + * complete. */ +#define APPEND(N_ENTRIES, ENTRY_SIZE) \ + do { \ + APPEND_SUBMIT(0, N_ENTRIES, ENTRY_SIZE); \ + APPEND_WAIT(0); \ + } while (0) + +/* Submit an append request with the given parameters and wait for the operation + * to fail with the given code and message. 
*/ +#define APPEND_FAILURE(N_ENTRIES, ENTRY_SIZE, STATUS, ERRMSG) \ + { \ + APPEND_SUBMIT(0, N_ENTRIES, ENTRY_SIZE); \ + APPEND_EXPECT(0, STATUS); \ + APPEND_WAIT(0); \ + f->count--; \ + munit_assert_string_equal(f->io.errmsg, ERRMSG); \ + } diff --git a/test/raft/integration/main_core.c b/test/raft/integration/main_core.c new file mode 100644 index 000000000..ad1798bba --- /dev/null +++ b/test/raft/integration/main_core.c @@ -0,0 +1,3 @@ +#include "../lib/runner.h" + +RUNNER("core") diff --git a/test/raft/integration/main_uv.c b/test/raft/integration/main_uv.c new file mode 100644 index 000000000..7f2eba543 --- /dev/null +++ b/test/raft/integration/main_uv.c @@ -0,0 +1,3 @@ +#include "../lib/runner.h" + +RUNNER("uv") diff --git a/test/raft/integration/test_apply.c b/test/raft/integration/test_apply.c new file mode 100644 index 000000000..650df5a93 --- /dev/null +++ b/test/raft/integration/test_apply.c @@ -0,0 +1,160 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(2); + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_ELECT(0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; + raft_index prev_applied; + struct raft *raft; +}; + +static void applyCbAssertResult(struct raft_apply *req, int status, void *_) +{ + struct result *result = req->data; + (void)_; + munit_assert_int(status, ==, result->status); + if (status == 0) { + munit_assert_ulong(result->prev_applied, <, + raft_last_applied(result->raft)); + } + result->done = true; +} + +static bool applyCbHasFired(struct raft_fixture *f, void *arg) +{ + struct result *result = arg; + (void)f; + return result->done; +} + +/* Submit an apply request. */ +#define APPLY_SUBMIT(I, N) \ + struct raft_buffer _buf; \ + struct raft_apply _req; \ + struct raft *r = CLUSTER_RAFT(I); \ + struct result _result = {0, false, raft_last_applied(r), r}; \ + int _rv; \ + FsmEncodeSetX(N, &_buf); \ + _req.data = &_result; \ + _rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, 1, applyCbAssertResult); \ + munit_assert_int(_rv, ==, 0); + +/* Expect the apply callback to fire with the given status. */ +#define APPLY_EXPECT(STATUS) _result.status = STATUS + +/* Wait until an apply request completes. */ +#define APPLY_WAIT CLUSTER_STEP_UNTIL(applyCbHasFired, &_result, 2000) + +/* Submit to the I'th server a request to apply a new RAFT_COMMAND entry and + * wait for the operation to succeed. */ +#define APPLY(I, N) \ + do { \ + APPLY_SUBMIT(I, N); \ + APPLY_WAIT; \ + } while (0) + +/* Submit to the I'th server a request to apply a new RAFT_COMMAND entry and + * assert that the given error is returned. 
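+ *
+ * For example, APPLY_ERROR(1, RAFT_NOTLEADER, "server is not the leader")
+ * asserts that a follower refuses the request, as in the notLeader test
+ * below.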
+ */
+#define APPLY_ERROR(I, RV, ERRMSG)                                        \
+	do {                                                              \
+		struct raft_buffer _buf;                                  \
+		struct raft_apply _req;                                   \
+		int _rv;                                                  \
+		FsmEncodeSetX(123, &_buf);                                \
+		_rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, 1, NULL); \
+		munit_assert_int(_rv, ==, RV);                            \
+		munit_assert_string_equal(CLUSTER_ERRMSG(I), ERRMSG);     \
+		raft_free(_buf.base);                                     \
+	} while (0)
+
+/******************************************************************************
+ *
+ * Success scenarios
+ *
+ *****************************************************************************/
+
+SUITE(raft_apply)
+
+/* Append the very first command entry. */
+TEST(raft_apply, first, setUp, tearDown, 0, NULL)
+{
+	struct fixture *f = data;
+	int val = 123;
+	APPLY(0, val);
+	munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val);
+	return MUNIT_OK;
+}
+
+/* Append two command entries. */
+TEST(raft_apply, two, setUp, tearDown, 0, NULL)
+{
+	struct fixture *f = data;
+	int val = 123;
+	APPLY(0, val);
+	munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val);
+	val = 124;
+	APPLY(0, val);
+	munit_assert_int(FsmGetX(CLUSTER_FSM(0)), ==, val);
+	return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * Failure scenarios
+ *
+ *****************************************************************************/
+
+/* If the raft instance is not in leader state, an error is returned. */
+TEST(raft_apply, notLeader, setUp, tearDown, 0, NULL)
+{
+	struct fixture *f = data;
+	APPLY_ERROR(1, RAFT_NOTLEADER, "server is not the leader");
+	return MUNIT_OK;
+}
+
+/* If the raft instance steps down from leader state, the apply callback fires
+ * with an error. */
+TEST(raft_apply, leadershipLost, setUp, tearDown, 0, NULL)
+{
+	struct fixture *f = data;
+	APPLY_SUBMIT(0, 123);
+	APPLY_EXPECT(RAFT_LEADERSHIPLOST);
+	CLUSTER_DEPOSE;
+	APPLY_WAIT;
+	return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_assign.c b/test/raft/integration/test_assign.c
new file mode 100644
index 000000000..7404b3fe6
--- /dev/null
+++ b/test/raft/integration/test_assign.c
@@ -0,0 +1,457 @@
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+	FIXTURE_CLUSTER;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+struct result
+{
+	int status;
+	bool done;
+};
+
+/* Add an empty server to the cluster and start it. */
+#define GROW                                        \
+	{                                           \
+		int rv__;                           \
+		CLUSTER_GROW;                       \
+		rv__ = raft_start(CLUSTER_RAFT(2)); \
+		munit_assert_int(rv__, ==, 0);      \
+	}
+
+static void changeCbAssertResult(struct raft_change *req, int status)
+{
+	struct result *result = req->data;
+	munit_assert_int(status, ==, result->status);
+	result->done = true;
+}
+
+static bool changeCbHasFired(struct raft_fixture *f, void *arg)
+{
+	struct result *result = arg;
+	(void)f;
+	return result->done;
+}
+
+/* Submit an add request.
 */
+#define ADD_SUBMIT(I, ID)                                                  \
+    struct raft_change _req;                                               \
+    char _address[16];                                                     \
+    struct result _result = {0, false};                                    \
+    int _rv;                                                               \
+    _req.data = &_result;                                                  \
+    sprintf(_address, "%d", ID);                                           \
+    _rv =                                                                  \
+        raft_add(CLUSTER_RAFT(I), &_req, ID, _address, changeCbAssertResult); \
+    munit_assert_int(_rv, ==, 0);
+
+#define ADD(I, ID)                                            \
+    do {                                                      \
+        ADD_SUBMIT(I, ID);                                    \
+        CLUSTER_STEP_UNTIL(changeCbHasFired, &_result, 2000); \
+    } while (0)
+
+/* Submit an assign role request. */
+#define ASSIGN_SUBMIT(I, ID, ROLE)                                             \
+    struct raft_change _req;                                                   \
+    struct result _result = {0, false};                                        \
+    int _rv;                                                                   \
+    _req.data = &_result;                                                      \
+    _rv = raft_assign(CLUSTER_RAFT(I), &_req, ID, ROLE, changeCbAssertResult); \
+    munit_assert_int(_rv, ==, 0);
+
+/* Expect the request callback to fire with the given status. */
+#define ASSIGN_EXPECT(STATUS) _result.status = STATUS;
+
+/* Wait until an assign request completes. */
+#define ASSIGN_WAIT CLUSTER_STEP_UNTIL(changeCbHasFired, &_result, 10000)
+
+/* Submit a request to the I'th server to assign the given role to the server
+ * with the given ID, and wait for the operation to succeed. */
+#define ASSIGN(I, ID, ROLE)             \
+    do {                                \
+        ASSIGN_SUBMIT(I, ID, ROLE);     \
+        ASSIGN_WAIT;                    \
+    } while (0)
+
+/* Invoke raft_assign() against the I'th server and assert it returns the
+ * given error code. */
+#define ASSIGN_ERROR(I, ID, ROLE, RV, ERRMSG)                        \
+    {                                                                \
+        struct raft_change __req;                                    \
+        int __rv;                                                    \
+        __rv = raft_assign(CLUSTER_RAFT(I), &__req, ID, ROLE, NULL); \
+        munit_assert_int(__rv, ==, RV);                              \
+        munit_assert_string_equal(ERRMSG, CLUSTER_ERRMSG(I));        \
+    }
+
+/******************************************************************************
+ *
+ * Set up a cluster of 2 servers, with the first as leader.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(2);
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Assertions
+ *
+ *****************************************************************************/
+
+/* Assert the values of the committed and uncommitted configuration indexes on
+ * the raft instance with the given index. */
+#define ASSERT_CONFIGURATION_INDEXES(I, COMMITTED, UNCOMMITTED)                \
+    {                                                                          \
+        struct raft *raft_ = CLUSTER_RAFT(I);                                  \
+        munit_assert_int(raft_->configuration_committed_index, ==, COMMITTED); \
+        munit_assert_int(raft_->configuration_uncommitted_index, ==,           \
+                         UNCOMMITTED);                                         \
+    }
+
+/* Assert that the state of the current catch up round matches the given
+ * values. */
+#define ASSERT_CATCH_UP_ROUND(I, PROMOTEE_ID, NUMBER, DURATION)               \
+    {                                                                         \
+        struct raft *raft_ = CLUSTER_RAFT(I);                                 \
+        munit_assert_int(raft_->leader_state.promotee_id, ==, PROMOTEE_ID);   \
+        munit_assert_int(raft_->leader_state.round_number, ==, NUMBER);       \
+        munit_assert_int(                                                     \
+            raft_->io->time(raft_->io) - raft_->leader_state.round_start, >=, \
+            DURATION);                                                        \
+    }
+
+/******************************************************************************
+ *
+ * raft_assign
+ *
+ *****************************************************************************/
+
+SUITE(raft_assign)
+
+/* Assigning the voter role to a spare server whose log is already up-to-date
+ * causes the relevant configuration change to be submitted immediately.
 */
+TEST(raft_assign, promoteUpToDate, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft *raft;
+    const struct raft_server *server;
+    GROW;
+    ADD(0, 3);
+    CLUSTER_STEP_N(3);
+
+    ASSIGN(0, 3, RAFT_VOTER);
+
+    /* Server 3 is being considered as voting, even though the configuration
+     * change is not committed yet. */
+    raft = CLUSTER_RAFT(0);
+    server = &raft->configuration.servers[2];
+    munit_assert_int(server->role, ==, RAFT_VOTER);
+
+    /* The configuration change request eventually succeeds. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000);
+
+    return MUNIT_OK;
+}
+
+static bool thirdServerHasCaughtUp(struct raft_fixture *f, void *arg)
+{
+    struct raft *raft = raft_fixture_get(f, 0);
+    (void)arg;
+    return raft->leader_state.promotee_id == 0;
+}
+
+/* Assigning the voter role to a spare server whose log is not up-to-date
+ * causes catch-up rounds to start. When the server has caught up, the
+ * configuration change request gets submitted. */
+TEST(raft_assign, promoteCatchUp, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft *raft;
+    const struct raft_server *server;
+    CLUSTER_MAKE_PROGRESS;
+    GROW;
+    ADD(0, 3);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+
+    /* Server 3 is not being considered as voting, since its log is behind. */
+    raft = CLUSTER_RAFT(0);
+    server = &raft->configuration.servers[2];
+    munit_assert_int(server->role, ==, RAFT_SPARE);
+
+    /* Advance the match index of server 3, by acknowledging the AppendEntries
+     * request that the leader has sent to it. */
+    CLUSTER_STEP_UNTIL_APPLIED(2, 3, 2000);
+
+    /* Disconnect the second server, so it doesn't participate in the quorum */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+
+    /* Eventually the leader notices that the third server has caught up. */
+    CLUSTER_STEP_UNTIL(thirdServerHasCaughtUp, NULL, 2000);
+
+    /* The leader has submitted a configuration change request, but it's
+     * uncommitted. */
+    ASSERT_CONFIGURATION_INDEXES(0, 4, 5);
+
+    /* The third server notifies that it has appended the new
+     * configuration. Since it's considered voting already, it counts for the
+     * majority and the entry gets committed. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, 5, 2000);
+    CLUSTER_STEP_UNTIL_APPLIED(2, 5, 2000);
+
+    /* The promotion is completed. */
+    ASSERT_CONFIGURATION_INDEXES(0, 5, 0);
+
+    return MUNIT_OK;
+}
+
+static bool thirdServerHasCompletedFirstRound(struct raft_fixture *f, void *arg)
+{
+    struct raft *raft = raft_fixture_get(f, 0);
+    (void)arg;
+    return raft->leader_state.round_number != 1;
+}
+
+/* Assigning the voter role to a spare server whose log is not up-to-date
+ * causes catch-up rounds to start. If new entries are appended after a round
+ * is started, a new round is initiated once the former one completes. */
+TEST(raft_assign, promoteNewRound, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    unsigned election_timeout = CLUSTER_RAFT(0)->election_timeout;
+    struct raft_apply *req = munit_malloc(sizeof *req);
+    CLUSTER_MAKE_PROGRESS;
+    GROW;
+    ADD(0, 3);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+    ASSERT_CATCH_UP_ROUND(0, 3, 1, 0);
+
+    /* Now that the catch-up round started, submit a new entry and set a very
+     * high latency on the server being promoted, so it won't deliver
+     * AppendEntries results within the round duration.
 */
+    CLUSTER_APPLY_ADD_X(0, req, 1, NULL);
+    CLUSTER_STEP_UNTIL_ELAPSED(election_timeout + 100);
+
+    // FIXME: unstable with 0xcf1f25b6
+    // ASSERT_CATCH_UP_ROUND(0, 3, 1, election_timeout + 100);
+
+    /* The leader eventually receives the AppendEntries result from the
+     * promotee, acknowledging all entries except the last one. The first round
+     * completes and a new one starts. */
+    CLUSTER_STEP_UNTIL(thirdServerHasCompletedFirstRound, NULL, 2000);
+
+    /* Eventually the server is promoted and everyone applies the entry. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 5000);
+
+    /* The promotion is eventually completed. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, req->index + 1, 5000);
+    ASSERT_CONFIGURATION_INDEXES(0, 6, 0);
+
+    free(req);
+
+    return MUNIT_SKIP;
+}
+
+static bool secondServerHasNewConfiguration(struct raft_fixture *f, void *arg)
+{
+    struct raft *raft = raft_fixture_get(f, 1);
+    (void)arg;
+    return raft->configuration.servers[2].role == RAFT_VOTER;
+}
+
+/* If a follower receives an AppendEntries RPC containing a RAFT_CHANGE entry
+ * which changes the role of a server, the configuration change is immediately
+ * applied locally, even if the entry is not yet committed. Once the entry is
+ * committed, the change becomes permanent. */
+TEST(raft_assign, changeIsImmediate, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    GROW;
+    CLUSTER_MAKE_PROGRESS;
+    ADD(0, 3);
+    CLUSTER_STEP_UNTIL_APPLIED(1, 4, 2000);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+    CLUSTER_STEP_UNTIL(secondServerHasNewConfiguration, NULL, 3000);
+    ASSERT_CONFIGURATION_INDEXES(1, 4, 5);
+
+    ASSIGN_WAIT;
+
+    return MUNIT_OK;
+}
+
+/* Assign the stand-by role to an idle server. */
+TEST(raft_assign, promoteToStandBy, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    GROW;
+    ADD(0, 3);
+    ASSIGN(0, 3, RAFT_STANDBY);
+    return MUNIT_OK;
+}
+
+/* Trying to promote a server on a raft instance which is not the leader
+ * results in an error. */
+TEST(raft_assign, notLeader, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    ASSIGN_ERROR(1, 3, RAFT_VOTER, RAFT_NOTLEADER, "server is not the leader");
+    return MUNIT_OK;
+}
+
+/* Trying to change the role of a server whose ID is unknown results in an
+ * error. */
+TEST(raft_assign, unknownId, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    ASSIGN_ERROR(0, 3, RAFT_VOTER, RAFT_NOTFOUND, "no server has ID 3");
+    return MUNIT_OK;
+}
+
+/* Trying to assign an unknown role to a server results in an error. */
+TEST(raft_assign, badRole, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    ASSIGN_ERROR(0, 3, 999, RAFT_BADROLE, "server role is not valid");
+    return MUNIT_OK;
+}
+
+/* Trying to assign the voter role to a server which already has it results in
+ * an error. */
+TEST(raft_assign, alreadyHasRole, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    ASSIGN_ERROR(0, 1, RAFT_VOTER, RAFT_BADROLE, "server is already voter");
+    return MUNIT_OK;
+}
+
+/* Trying to assign a new role to a server while a configuration change is in
+ * progress results in an error.
 */
+TEST(raft_assign, changeRequestAlreadyInProgress, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    GROW;
+    ADD(0, 3);
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+    ASSIGN_ERROR(0, 3, RAFT_VOTER, RAFT_CANTCHANGE,
+                 "a configuration change is already in progress");
+    ASSIGN_WAIT;
+    return MUNIT_OK;
+}
+
+/* If leadership is lost before the configuration change log entry for setting
+ * the new server role is committed, the leader configuration gets rolled back
+ * and the role of the server being changed is reverted. */
+TEST(raft_assign, leadershipLost, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    const struct raft_server *server;
+    /* TODO: fix */
+    return MUNIT_SKIP;
+    GROW;
+    ADD(0, 3);
+    CLUSTER_STEP_N(2);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+
+    /* Server 3 is being considered as voting, even though the configuration
+     * change is not committed yet. */
+    ASSERT_CATCH_UP_ROUND(0, 0, 0, 0);
+    ASSERT_CONFIGURATION_INDEXES(0, 2, 3);
+    server = configurationGet(&CLUSTER_RAFT(0)->configuration, 3);
+    munit_assert_int(server->role, ==, RAFT_VOTER);
+
+    /* Lose leadership. */
+    CLUSTER_DEPOSE;
+
+    /* A new leader gets elected. */
+    CLUSTER_ELECT(1);
+    CLUSTER_STEP_N(5);
+
+    /* Server 3 is not being considered voting anymore. */
+    server = configurationGet(&CLUSTER_RAFT(0)->configuration, 3);
+    munit_assert_int(server->role, ==, RAFT_STANDBY);
+
+    return MUNIT_OK;
+}
+
+/* Trying to assign the voter role to an unresponsive server eventually
+ * fails. */
+TEST(raft_assign, promoteUnresponsive, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_MAKE_PROGRESS;
+    GROW;
+    ADD(0, 3);
+
+    ASSIGN_SUBMIT(0, 3, RAFT_VOTER);
+    CLUSTER_KILL(2);
+
+    ASSIGN_EXPECT(RAFT_NOCONNECTION);
+    ASSIGN_WAIT;
+
+    return MUNIT_OK;
+}
+
+/* Demote a voter node to stand-by.
*/ +TEST(raft_assign, demoteToStandBy, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ASSIGN(0, 2, RAFT_STANDBY); + return MUNIT_OK; +} + +/* The leader can be demoted to stand-by and will no longer act as leader */ +TEST(raft_assign, demoteLeader, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ASSIGN_SUBMIT(0, 1, RAFT_STANDBY); + munit_assert_int(CLUSTER_LEADER, ==, 0); + ASSIGN_WAIT; + CLUSTER_STEP_UNTIL_HAS_LEADER(5000); + munit_assert_int(CLUSTER_LEADER, !=, 0); + return MUNIT_OK; +} + +/* The leader can be demoted to spare and will no longer act as leader */ +TEST(raft_assign, demoteLeaderToSpare, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ASSIGN_SUBMIT(0, 1, RAFT_SPARE); + munit_assert_int(CLUSTER_LEADER, ==, 0); + ASSIGN_WAIT; + CLUSTER_STEP_UNTIL_HAS_LEADER(5000); + munit_assert_int(CLUSTER_LEADER, !=, 0); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_barrier.c b/test/raft/integration/test_barrier.c new file mode 100644 index 000000000..8d95a8095 --- /dev/null +++ b/test/raft/integration/test_barrier.c @@ -0,0 +1,94 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(2); + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_ELECT(0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; +}; + +static void barrierCbAssertResult(struct raft_barrier *req, int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +static bool barrierCbHasFired(struct raft_fixture *f, void *arg) +{ + struct result *result = arg; + (void)f; + return result->done; +} + +/* Submit a barrier request. */ +#define BARRIER_SUBMIT(I) \ + struct raft_barrier _req; \ + struct result _result = {0, false}; \ + int _rv; \ + _req.data = &_result; \ + _rv = raft_barrier(CLUSTER_RAFT(I), &_req, barrierCbAssertResult); \ + munit_assert_int(_rv, ==, 0); + +/* Expect the barrier callback to fire with the given status. */ +#define BARRIER_EXPECT(STATUS) _result.status = STATUS + +/* Wait until the barrier request completes. */ +#define BARRIER_WAIT CLUSTER_STEP_UNTIL(barrierCbHasFired, &_result, 2000) + +/* Submit to the I'th server a barrier request and wait for the operation to + * succeed. 
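+ *
+ * (A barrier carries no payload: raft_barrier() appends a no-op entry and the
+ * callback fires once that entry is committed and applied, which is how these
+ * tests flush any work in flight. A minimal sketch of the direct call,
+ * assuming the request stays alive until the callback runs:
+ *
+ *     struct raft_barrier req;
+ *     struct result result = {0, false};
+ *     req.data = &result;
+ *     int rv = raft_barrier(CLUSTER_RAFT(0), &req, barrierCbAssertResult);
+ *     munit_assert_int(rv, ==, 0);
+ * )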
 */
+#define BARRIER(I)             \
+    do {                       \
+        BARRIER_SUBMIT(I);     \
+        BARRIER_WAIT;          \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Success scenarios
+ *
+ *****************************************************************************/
+
+SUITE(raft_barrier)
+
+TEST(raft_barrier, cb, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    BARRIER(0);
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_bootstrap.c b/test/raft/integration/test_bootstrap.c
new file mode 100644
index 000000000..43043f967
--- /dev/null
+++ b/test/raft/integration/test_bootstrap.c
@@ -0,0 +1,57 @@
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Fixture holding a pristine raft instance.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_CLUSTER;
+};
+
+static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(1);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Bootstrap tests.
+ *
+ *****************************************************************************/
+
+SUITE(raft_bootstrap)
+
+/* Attempting to bootstrap an instance that's already started results in
+ * RAFT_BUSY. */
+TEST(raft_bootstrap, busy, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft *raft;
+    struct raft_configuration configuration;
+    int rv;
+
+    /* Bootstrap and start the first server. */
+    CLUSTER_BOOTSTRAP_N_VOTING(1);
+    CLUSTER_START;
+
+    raft = CLUSTER_RAFT(0);
+    CLUSTER_CONFIGURATION(&configuration);
+    rv = raft_bootstrap(raft, &configuration);
+    munit_assert_int(rv, ==, RAFT_BUSY);
+    raft_configuration_close(&configuration);
+
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_digest.c b/test/raft/integration/test_digest.c
new file mode 100644
index 000000000..98e5ee9e0
--- /dev/null
+++ b/test/raft/integration/test_digest.c
@@ -0,0 +1,14 @@
+#include "../../../src/raft.h"
+#include "../lib/runner.h"
+
+SUITE(raft_digest)
+
+/* Generation of the ID of the bootstrap dqlite node.
*/ +TEST(raft_digest, bootstrapServerId, NULL, NULL, 0, NULL) +{ + const char *address = "127.0.0.1:65536"; + unsigned long long id; + id = raft_digest(address, 0); + munit_assert_int(id, ==, 138882483); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_election.c b/test/raft/integration/test_election.c new file mode 100644 index 000000000..d67b8e8ff --- /dev/null +++ b/test/raft/integration/test_election.c @@ -0,0 +1,800 @@ +#include "../../../src/raft/configuration.h" +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + unsigned i; + SETUP_CLUSTER(2); + CLUSTER_BOOTSTRAP; + for (i = 0; i < CLUSTER_N; i++) { + struct raft *raft = CLUSTER_RAFT(i); + raft->data = f; + } + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Parameters + * + *****************************************************************************/ + +static char *cluster_5[] = {"5", NULL}; + +static MunitParameterEnum cluster_5_params[] = { + {CLUSTER_N_PARAM, cluster_5}, + {NULL, NULL}, +}; + +static char *cluster_3[] = {"3", NULL}; + +static MunitParameterEnum cluster_3_params[] = { + {CLUSTER_N_PARAM, cluster_3}, + {NULL, NULL}, +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Wait until the I'th server becomes candidate. */ +#define STEP_UNTIL_CANDIDATE(I) \ + CLUSTER_STEP_UNTIL_STATE_IS(I, RAFT_CANDIDATE, 2000) + +/* Wait until the I'th server becomes leader. */ +#define STEP_UNTIL_LEADER(I) CLUSTER_STEP_UNTIL_STATE_IS(I, RAFT_LEADER, 2000) + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert that the I'th server is in follower state. */ +#define ASSERT_FOLLOWER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_FOLLOWER) + +/* Assert that the I'th server is in candidate state. */ +#define ASSERT_CANDIDATE(I) \ + munit_assert_int(CLUSTER_STATE(I), ==, RAFT_CANDIDATE) + +/* Assert that the I'th server is in leader state. */ +#define ASSERT_LEADER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_LEADER) + +/* Assert that the I'th server is unavailable. */ +#define ASSERT_UNAVAILABLE(I) \ + munit_assert_int(CLUSTER_STATE(I), ==, RAFT_UNAVAILABLE) + +/* Assert that the I'th server has voted for the server with the given ID. */ +#define ASSERT_VOTED_FOR(I, ID) munit_assert_int(CLUSTER_VOTED_FOR(I), ==, ID) + +/* Assert that the I'th server has the given current term. 
 */
+#define ASSERT_TERM(I, TERM)                             \
+    {                                                    \
+        struct raft *raft_ = CLUSTER_RAFT(I);            \
+        munit_assert_int(raft_->current_term, ==, TERM); \
+    }
+
+/* Assert that the fixture time matches the given value */
+#define ASSERT_TIME(TIME) munit_assert_int(CLUSTER_TIME, ==, TIME)
+
+/******************************************************************************
+ *
+ * Successful election round
+ *
+ *****************************************************************************/
+
+SUITE(election)
+
+/* Test an election round with two voters. */
+TEST(election, twoVoters, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    (void)params;
+    CLUSTER_START;
+
+    /* The first server eventually times out and converts to candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    CLUSTER_STEP; /* Server 1 tick */
+    ASSERT_FOLLOWER(1);
+
+    CLUSTER_STEP; /* Server 0 completes sending a RequestVote RPC */
+    CLUSTER_STEP; /* Server 1 receives RequestVote RPC */
+    ASSERT_VOTED_FOR(1, 1);
+    ASSERT_TIME(1015);
+
+    CLUSTER_STEP; /* Server 1 completes sending the RequestVote result */
+    CLUSTER_STEP; /* Server 0 receives the RequestVote result */
+    ASSERT_LEADER(0);
+    ASSERT_TIME(1030);
+
+    return MUNIT_OK;
+}
+
+/* If we have already voted and the same candidate requests the vote again, the
+ * vote is granted. */
+TEST(election, grantAgain, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    (void)params;
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 10000);
+    raft_set_election_timeout(CLUSTER_RAFT(1), 10000);
+    CLUSTER_START;
+
+    /* The first server converts to candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    CLUSTER_STEP; /* Server 1 tick */
+    ASSERT_FOLLOWER(1);
+
+    /* Disconnect the second server, so the first server does not receive the
+     * result and eventually starts a new election round. */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+    CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TIME(2000);
+
+    /* Reconnecting the two servers eventually makes the first server win the
+     * election. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 1);
+    STEP_UNTIL_LEADER(0);
+    ASSERT_TIME(2030);
+
+    return MUNIT_OK;
+}
+
+/* If the requester's last log entry index is the same, the vote is granted. */
+TEST(election, grantIfLastIndexIsSame, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    (void)params;
+
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 1;
+    FsmEncodeSetX(1, &entry1.buf);
+
+    entry2.type = RAFT_COMMAND;
+    entry2.term = 1;
+    FsmEncodeSetX(1, &entry2.buf);
+
+    CLUSTER_ADD_ENTRY(0, &entry1);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+    CLUSTER_SET_TERM(1, 2);
+
+    CLUSTER_START;
+
+    /* The first server converts to candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+
+    /* The first server eventually receives a RequestVote result RPC and
+     * converts to leader */
+    STEP_UNTIL_LEADER(0);
+    ASSERT_TIME(1030);
+
+    return MUNIT_OK;
+}
+
+/* If the requester's last log entry index is higher, the vote is granted. */
+TEST(election, grantIfLastIndexIsHigher, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    (void)params;
+
+    entry.type = RAFT_COMMAND;
+    entry.term = 1;
+    FsmEncodeSetX(1, &entry.buf);
+
+    CLUSTER_ADD_ENTRY(0, &entry);
+    CLUSTER_SET_TERM(1, 2);
+
+    CLUSTER_START;
+
+    /* The first server converts to candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+
+    /* The second server grants its vote.
*/ + CLUSTER_STEP_UNTIL_VOTED_FOR(1, 0, 2000); + + /* The first server receives a RequestVote result RPC and converts to + * leader */ + CLUSTER_STEP_N(2); + ASSERT_LEADER(0); + + return MUNIT_OK; +} + +/* If a candidate receives a vote request response granting the vote but the + * quorum is not reached, it stays candidate. */ +TEST(election, waitQuorum, setUp, tearDown, 0, cluster_5_params) +{ + struct fixture *f = data; + (void)params; + CLUSTER_START; + + /* The first server converts to candidate. */ + STEP_UNTIL_CANDIDATE(0); + + /* All servers grant their vote. */ + CLUSTER_STEP_UNTIL_VOTED_FOR(1, 0, 2000); + CLUSTER_STEP_UNTIL_VOTED_FOR(2, 0, 2000); + CLUSTER_STEP_UNTIL_VOTED_FOR(3, 0, 2000); + CLUSTER_STEP_UNTIL_VOTED_FOR(4, 0, 2000); + ASSERT_TIME(1015); + + /* The first server receives the first RequestVote result RPC but stays + * candidate since it has only 2 votes, and 3 are required. */ + CLUSTER_STEP_N(4); /* Send completes on all other servers */ + CLUSTER_STEP; /* First message is delivered */ + ASSERT_TIME(1030); + ASSERT_CANDIDATE(0); + + /* Eventually we are elected */ + CLUSTER_STEP; /* Second message is delivered */ + ASSERT_LEADER(0); /* Server 0 reaches the quorum */ + ASSERT_TIME(1030); + + return MUNIT_OK; +} + +/* The vote request gets rejected if our term is higher. */ +TEST(election, rejectIfHigherTerm, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + CLUSTER_SET_TERM(1, 3); + CLUSTER_START; + + /* The first server converts to candidate. */ + STEP_UNTIL_CANDIDATE(0); + + CLUSTER_STEP_N(3); /* Server 1 tick and RequestVote send/delivery */ + + /* The second server receives a RequestVote RPC and rejects the vote for the + * first server. */ + ASSERT_VOTED_FOR(1, 0); + + CLUSTER_STEP_N(2); /* RequestVote result send/delivery */ + + /* The first server receives the RequestVote result RPC and converts to + * follower because it discovers the newer term. */ + ASSERT_FOLLOWER(0); + + return 0; +} + +/* If the server already has a leader, the vote is not granted (even if the + * request has a higher term). */ +TEST(election, rejectIfHasLeader, setUp, tearDown, 0, cluster_3_params) +{ + struct fixture *f = data; + (void)params; + CLUSTER_START; + + /* Server 0 wins the elections. */ + STEP_UNTIL_LEADER(0); + + /* Server 2 gets disconnected and becomes candidate. */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + STEP_UNTIL_CANDIDATE(2); + + /* Server 2 stays candidate since its requests get rejected. */ + CLUSTER_STEP_N(20); + ASSERT_CANDIDATE(2); + + return MUNIT_OK; +} + +/* If a server has already voted, vote is not granted. */ +TEST(election, rejectIfAlreadyVoted, setUp, tearDown, 0, cluster_3_params) +{ + struct fixture *f = data; + (void)params; + + /* Disconnect server 1 from server 0 and change its randomized election + * timeout to match the one of server 0. This way server 1 will convert to + * candidate but not receive vote requests. */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 1000); + CLUSTER_SATURATE_BOTHWAYS(0, 1); + + CLUSTER_START; + + /* Server 0 and server 1 both become candidates. */ + STEP_UNTIL_CANDIDATE(0); + STEP_UNTIL_CANDIDATE(1); + ASSERT_TIME(1000); + + /* Server 2 receives the vote request from server 0 and grants it. */ + CLUSTER_STEP_UNTIL_VOTED_FOR(2, 0, 2000); + ASSERT_TIME(1015); + + /* Server 0 receives the vote result from server 2 and becomes leader. */ + STEP_UNTIL_LEADER(0); + ASSERT_TIME(1030); + + /* Server 1 is still candidate because its vote request got rejected. 
 */
+    ASSERT_CANDIDATE(1);
+
+    return MUNIT_OK;
+}
+
+/* If the requester's last log entry term is lower than ours, the vote is not
+ * granted. */
+TEST(election, rejectIfLastTermIsLower, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    (void)params;
+
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 1;
+    FsmEncodeSetX(123, &entry1.buf);
+
+    entry2.type = RAFT_COMMAND;
+    entry2.term = 2;
+    FsmEncodeSetX(456, &entry2.buf);
+
+    CLUSTER_ADD_ENTRY(0, &entry1);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    CLUSTER_START;
+
+    /* The first server becomes candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    /* The second server receives a RequestVote RPC and rejects the vote for
+     * the first server. */
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100);
+    ASSERT_VOTED_FOR(1, 0);
+    ASSERT_TIME(1015);
+
+    /* The first server receives the response and stays candidate. */
+    CLUSTER_STEP_UNTIL_DELIVERED(1, 0, 100);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TIME(1030);
+
+    /* Eventually the second server becomes leader because its log is more
+     * up-to-date. */
+    STEP_UNTIL_LEADER(1);
+    ASSERT_TIME(1130);
+
+    return MUNIT_OK;
+}
+
+/* If the requester's last log entry index is lower, the vote is not
+ * granted. */
+TEST(election, rejectIfLastIndexIsLower, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    (void)params;
+
+    entry.type = RAFT_COMMAND;
+    entry.term = 2;
+    FsmEncodeSetX(123, &entry.buf);
+
+    CLUSTER_ADD_ENTRY(1, &entry);
+
+    CLUSTER_START;
+
+    /* The first server becomes candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    /* The second server receives a RequestVote RPC and rejects the vote for
+     * the first server. */
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100);
+    ASSERT_VOTED_FOR(1, 0);
+    ASSERT_TIME(1015);
+
+    /* The first server receives the response and stays candidate. */
+    CLUSTER_STEP_UNTIL_DELIVERED(1, 0, 100);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TIME(1030);
+
+    /* Eventually the second server becomes leader because it has a longer
+     * log. */
+    STEP_UNTIL_LEADER(1);
+    ASSERT_TIME(1130);
+
+    return MUNIT_OK;
+}
+
+static char *reject_not_voting_n[] = {"3", NULL};
+static char *reject_not_voting_n_voting[] = {"2", NULL};
+
+static MunitParameterEnum reject_not_voting_params[] = {
+    {CLUSTER_N_PARAM, reject_not_voting_n},
+    {CLUSTER_N_VOTING_PARAM, reject_not_voting_n_voting},
+    {NULL, NULL},
+};
+
+/* If we are not a voting server, the vote is not granted. */
+TEST(election, rejectIfNotVoter, setUp, tearDown, 0, reject_not_voting_params)
+{
+    struct fixture *f = data;
+
+    /* Disconnect server 0 from server 1, so server 0 can't win the elections
+     * (since there are only 2 voting servers). */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+
+    CLUSTER_START;
+
+    /* Server 0 becomes candidate. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+
+    /* Server 0 stays candidate because it can't reach a quorum. */
+    CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TIME(2000);
+
+    return MUNIT_OK;
+}
+
+/* If a candidate server receives a response indicating that the vote was not
+ * granted, nothing happens (e.g. the server has already voted for someone
+ * else).
 */
+TEST(election, receiveRejectResult, setUp, tearDown, 0, cluster_5_params)
+{
+    struct fixture *f = data;
+    (void)params;
+
+    /* Lower the randomized election timeout of server 4, so it becomes
+     * candidate just after server 0 */
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 4, 1020);
+
+    /* Disconnect server 0 from all others except server 1. */
+    CLUSTER_SATURATE_BOTHWAYS(0, 2);
+    CLUSTER_SATURATE_BOTHWAYS(0, 3);
+    CLUSTER_SATURATE_BOTHWAYS(0, 4);
+
+    /* Disconnect server 4 from all others except server 1. */
+    CLUSTER_SATURATE_BOTHWAYS(4, 0);
+    CLUSTER_SATURATE_BOTHWAYS(4, 2);
+    CLUSTER_SATURATE_BOTHWAYS(4, 3);
+
+    CLUSTER_START;
+
+    /* Server 0 becomes candidate, while server 4 is still a follower. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+    ASSERT_FOLLOWER(4);
+
+    /* Server 1 receives a RequestVote RPC and grants its vote. */
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100);
+    ASSERT_TIME(1015);
+    ASSERT_VOTED_FOR(1, 1);
+    ASSERT_CANDIDATE(0);
+    ASSERT_FOLLOWER(4);
+
+    /* Disconnect server 0 from server 1, so it doesn't receive further
+     * messages. */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+
+    /* Server 4 eventually becomes candidate too. */
+    STEP_UNTIL_CANDIDATE(4);
+    ASSERT_TIME(1100);
+    ASSERT_CANDIDATE(0);
+
+    /* The second server receives a RequestVote RPC but rejects its vote since
+     * it has already voted. */
+    CLUSTER_STEP_UNTIL_DELIVERED(4, 0, 100);
+    ASSERT_VOTED_FOR(1, 1);
+    ASSERT_CANDIDATE(0);
+    ASSERT_CANDIDATE(4);
+
+    return MUNIT_OK;
+}
+
+/* An I/O error occurs when persisting the term while converting to
+ * candidate. */
+TEST(election, ioErrorConvertTerm, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_START;
+
+    raft_fixture_term_fault(&f->cluster, 0, 0);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_UNAVAILABLE, 2000);
+
+    return MUNIT_OK;
+}
+
+/* An I/O error occurs when persisting the vote while converting to
+ * candidate. */
+TEST(election, ioErrorConvertVote, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_START;
+
+    raft_fixture_vote_fault(&f->cluster, 0, 0);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_UNAVAILABLE, 2000);
+
+    return MUNIT_OK;
+}
+
+/* An I/O error occurs when sending a vote request, and gets ignored. */
+TEST(election, ioErrorSendVoteRequest, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_START;
+
+    /* The first server fails to send a RequestVote RPC. */
+    raft_fixture_send_fault(&f->cluster, 0, 0);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 5000);
+
+    return MUNIT_OK;
+}
+
+/* An I/O error occurs when the second node tries to persist its vote. */
+TEST(election, ioErrorPersistVote, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_START;
+
+    /* The first server becomes candidate. */
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE, 2000);
+
+    /* The second server receives a RequestVote RPC but fails to persist its
+     * vote. */
+    raft_fixture_vote_fault(&f->cluster, 1, 0);
+    CLUSTER_STEP_UNTIL_STATE_IS(1, RAFT_UNAVAILABLE, 1000);
+
+    return MUNIT_OK;
+}
+
+/* Test an election round with two voters and pre-vote.
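+ *
+ * (With pre-vote, the election runs in two phases: the candidate first sends
+ * pre-vote RequestVote RPCs that neither bump its own term nor make voters
+ * persist anything, and only after a majority answers favourably does it
+ * increment its term and run the real election, as the assertions below
+ * trace step by step. Pre-vote is opt-in and enabled per instance:
+ *
+ *     raft_set_pre_vote(CLUSTER_RAFT(0), true);
+ * )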
 */
+TEST(election, preVote, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    raft_set_pre_vote(CLUSTER_RAFT(0), true);
+    raft_set_pre_vote(CLUSTER_RAFT(1), true);
+    CLUSTER_START;
+
+    /* The first server eventually times out and converts to candidate, but it
+     * does not increment its term yet. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+    ASSERT_TERM(0, 1);
+
+    CLUSTER_STEP; /* Server 1 tick */
+    ASSERT_FOLLOWER(1);
+
+    CLUSTER_STEP; /* Server 0 completes sending a pre-vote RequestVote RPC */
+    CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(1, 1);      /* Server 1 does not increment its term */
+    ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    CLUSTER_STEP; /* Server 1 completes sending pre-vote RequestVote result */
+    CLUSTER_STEP; /* Server 0 receives the pre-vote RequestVote result */
+    ASSERT_CANDIDATE(0);
+    ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */
+    ASSERT_TIME(1030);
+
+    CLUSTER_STEP; /* Server 0 completes sending an actual RequestVote RPC */
+    CLUSTER_STEP; /* Server 1 receives the actual RequestVote RPC */
+    ASSERT_TERM(1, 2);      /* Server 1 does increment its term. */
+    ASSERT_VOTED_FOR(1, 1); /* Server 1 does persist its vote */
+
+    CLUSTER_STEP; /* Server 1 completes sending actual RequestVote result */
+    CLUSTER_STEP; /* Server 0 receives the actual RequestVote result */
+    ASSERT_LEADER(0);
+
+    return MUNIT_OK;
+}
+
+/* A candidate receives votes then crashes. */
+TEST(election, preVoteWithcandidateCrash, setUp, tearDown, 0, cluster_3_params)
+{
+    struct fixture *f = data;
+    raft_set_pre_vote(CLUSTER_RAFT(0), true);
+    raft_set_pre_vote(CLUSTER_RAFT(1), true);
+    raft_set_pre_vote(CLUSTER_RAFT(2), true);
+    CLUSTER_START;
+
+    /* The first server eventually times out and converts to candidate, but it
+     * does not increment its term yet. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+    ASSERT_TERM(0, 1);
+
+    /* Servers 1 and 2 tick */
+    CLUSTER_STEP_N(2);
+    ASSERT_FOLLOWER(1);
+    ASSERT_FOLLOWER(2);
+
+    /* Server 0 completes sending the pre-vote RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+
+    CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(1, 1);      /* Server 1 does not increment its term */
+    ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    CLUSTER_STEP; /* Server 2 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(2, 1);      /* Server 2 does not increment its term */
+    ASSERT_VOTED_FOR(2, 0); /* Server 2 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    /* Servers 1 and 2 complete sending pre-vote RequestVote results */
+    CLUSTER_STEP_N(2);
+
+    /* Server 0 receives the pre-vote RequestVote results */
+    CLUSTER_STEP_N(2);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */
+    ASSERT_TIME(1030);
+
+    /* Server 0 completes sending the actual RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+
+    CLUSTER_STEP; /* Server 1 receives the actual RequestVote RPC */
+    ASSERT_TERM(1, 2);      /* Server 1 does increment its term. */
+    ASSERT_VOTED_FOR(1, 1); /* Server 1 does persist its vote */
+
+    CLUSTER_STEP; /* Server 2 receives the actual RequestVote RPC */
+    ASSERT_TERM(2, 2);      /* Server 2 does increment its term. */
+    ASSERT_VOTED_FOR(2, 1); /* Server 2 does persist its vote */
+
+    /* Server 0 crashes. */
+    CLUSTER_KILL(0);
+
+    /* Server 1 times out and starts an election.
+     * It doesn't increment its term */
+    STEP_UNTIL_CANDIDATE(1);
+    ASSERT_TIME(2200);
+    ASSERT_TERM(1, 2);
+
+    /* Server 1 completes sending the pre-vote RequestVote RPCs and server 2
+     * has received those RPCs.
+     * Since server 2 has no current leader (the leader crashed before sending
+     * a heartbeat), it will grant its vote to server 1, but will not persist
+     * it due to pre-vote; its persisted vote is still for server 0 (id 1). */
+    CLUSTER_STEP_N(5);
+    ASSERT_TERM(2, 2); /* Server 2 does not increment its term */
+    ASSERT_VOTED_FOR(2, 1);
+
+    /* Server 1 receives the pre-vote RequestVote result */
+    CLUSTER_STEP_N(2);
+    /* Server 1 increments its term to start a non-pre-vote election */
+    ASSERT_TERM(1, 3);      /* Server 1 has now incremented its term. */
+    ASSERT_VOTED_FOR(1, 2); /* Server 1 has persisted its vote */
+    ASSERT_TIME(2230);
+
+    /* Server 1 completes sending the actual RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+    /* Server 2 receives the actual RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+    ASSERT_VOTED_FOR(2, 2); /* Server 2 persists its vote */
+
+    /* Server 1 receives the RequestVote results and becomes leader */
+    CLUSTER_STEP_N(2);
+    ASSERT_LEADER(1);
+    return MUNIT_OK;
+}
+
+/* Ensure delayed pre-vote responses are not counted towards the real election
+ * quorum. */
+TEST(election, preVoteNoStaleVotes, setUp, tearDown, 0, cluster_3_params)
+{
+    struct fixture *f = data;
+    raft_set_pre_vote(CLUSTER_RAFT(0), true);
+    raft_set_pre_vote(CLUSTER_RAFT(1), true);
+    raft_set_pre_vote(CLUSTER_RAFT(2), true);
+
+    /* Server 2 is 1 term ahead of the other servers; this will allow it to
+     * send stale pre-vote responses that pass the term checks. */
+    CLUSTER_SET_TERM(2, 2);
+    CLUSTER_START;
+
+    /* The first server eventually times out and converts to candidate, but it
+     * does not increment its term yet. */
+    STEP_UNTIL_CANDIDATE(0);
+    ASSERT_TIME(1000);
+    ASSERT_TERM(0, 1);
+
+    /* Servers 1 and 2 tick */
+    CLUSTER_STEP_N(2);
+    ASSERT_FOLLOWER(1);
+    ASSERT_FOLLOWER(2);
+
+    /* Server 0 completes sending the pre-vote RequestVote RPCs */
+    CLUSTER_STEP_N(2);
+
+    CLUSTER_STEP; /* Server 1 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(1, 1);      /* Server 1 does not increment its term */
+    ASSERT_VOTED_FOR(1, 0); /* Server 1 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    CLUSTER_STEP; /* Server 2 receives the pre-vote RequestVote RPC */
+    ASSERT_TERM(2, 2);      /* Server 2 does not increment its term */
+    ASSERT_VOTED_FOR(2, 0); /* Server 2 does not persist its vote */
+    ASSERT_TIME(1015);
+
+    /* Slow down responses of server 2 */
+    CLUSTER_SET_NETWORK_LATENCY(2, 100);
+
+    /* Server 1 completes sending pre-vote RequestVote results */
+    CLUSTER_STEP_N(2);
+
+    /* Server 0 receives the pre-vote RequestVote results */
+    CLUSTER_STEP_N(2);
+    ASSERT_CANDIDATE(0);
+    ASSERT_TERM(0, 2); /* Server 0 has now incremented its term. */
+    ASSERT_TIME(1030);
+
+    /* Don't send messages from 0; this ensures no real RequestVote RPCs are
+     * sent */
+    CLUSTER_SATURATE(0, 1);
+    CLUSTER_SATURATE(0, 2);
+
+    /* Wait until all messages from 2 to 0 are delivered */
+    CLUSTER_STEP_UNTIL_DELIVERED(2, 0, 100);
+
+    /* Make sure we haven't counted the pre-vote result as a real vote */
+    ASSERT_CANDIDATE(0);
+    return MUNIT_OK;
+}
+
+/* A follower doesn't convert to candidate while waiting for log entries to be
+ * persisted. */
+TEST(election, inFlightAppendBlocksCandidacy, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply req;
+
+    /* Server 1 takes a long time to persist entries.
*/ + CLUSTER_SET_DISK_LATENCY(1, 10000); + + CLUSTER_START; + + /* Server 0 is the leader. It replicates a log entry. */ + CLUSTER_ELECT(0); + CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); + + /* Server 1 receives the entry. */ + CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 1000); + + /* Contact is lost between servers 0 and 1. */ + CLUSTER_SATURATE(0, 1); + CLUSTER_SATURATE(1, 0); + + /* Several election timeouts lapse, but server 1 does not become a + * candidate, because it's waiting for the entry to be persisted. */ + CLUSTER_STEP_UNTIL_ELAPSED(5000); + munit_assert_int(CLUSTER_STATE(1), ==, RAFT_FOLLOWER); + + /* Eventually, server 1 finishes persisting the entry and becomes a + * candidate. */ + CLUSTER_STEP_UNTIL_STATE_IS(1, RAFT_CANDIDATE, 10000); + + return MUNIT_OK; +} diff --git a/test/raft/integration/test_fixture.c b/test/raft/integration/test_fixture.c new file mode 100644 index 000000000..c693ea273 --- /dev/null +++ b/test/raft/integration/test_fixture.c @@ -0,0 +1,306 @@ +#include "../../../src/raft.h" +#include "../lib/fsm.h" +#include "../lib/heap.h" +#include "../lib/runner.h" + +#define N_SERVERS 3 + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + struct raft_fsm fsms[N_SERVERS]; + struct raft_fixture fixture; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_calloc(1, sizeof *f); + struct raft_configuration configuration; + unsigned i; + int rc; + SET_UP_HEAP; + for (i = 0; i < N_SERVERS; i++) { + FsmInit(&f->fsms[i], 2); + } + + rc = raft_fixture_init(&f->fixture); + munit_assert_int(rc, ==, 0); + + for (i = 0; i < N_SERVERS; i++) { + rc = raft_fixture_grow(&f->fixture, &f->fsms[i]); + munit_assert_int(rc, ==, 0); + } + + rc = raft_fixture_configuration(&f->fixture, N_SERVERS, &configuration); + munit_assert_int(rc, ==, 0); + + rc = raft_fixture_bootstrap(&f->fixture, &configuration); + munit_assert_int(rc, ==, 0); + + raft_configuration_close(&configuration); + + rc = raft_fixture_start(&f->fixture); + munit_assert_int(rc, ==, 0); + + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + unsigned i; + raft_fixture_close(&f->fixture); + for (i = 0; i < N_SERVERS; i++) { + FsmClose(&f->fsms[i]); + } + TEAR_DOWN_HEAP; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +#define GET(I) raft_fixture_get(&f->fixture, I) +#define STEP raft_fixture_step(&f->fixture) +#define STEP_N(N) raft_fixture_step_n(&f->fixture, N) +#define STEP_UNTIL_STATE_IS(I, STATE) \ + { \ + bool done_; \ + done_ = raft_fixture_step_until_state_is(&f->fixture, I, STATE, 2000); \ + munit_assert_true(done_); \ + } +#define STATE(I) raft_state(GET(I)) +#define ELECT(I) raft_fixture_elect(&f->fixture, I) +#define DEPOSE raft_fixture_depose(&f->fixture) +#define APPLY(I, REQ) \ + { \ + struct raft_buffer buf; \ + int rc; \ + FsmEncodeAddX(1, &buf); \ + rc = raft_apply(GET(I), REQ, &buf, 1, NULL); \ + munit_assert_int(rc, ==, 0); \ + } +#define STEP_UNTIL_APPLIED(INDEX) \ + raft_fixture_step_until_applied(&f->fixture, N_SERVERS, INDEX, INDEX * 1000) + +/****************************************************************************** + * + * Assertions + * + 
 *****************************************************************************/
+
+/* Assert that the fixture time matches the given value */
+#define ASSERT_TIME(TIME) \
+    munit_assert_int(raft_fixture_time(&f->fixture), ==, TIME)
+
+/* Assert that the I'th server is in the given state. */
+#define ASSERT_STATE(I, S) munit_assert_int(STATE(I), ==, S)
+
+/* Assert that the x field of the FSM with the given index matches the given
+ * value. */
+#define ASSERT_FSM_X(I, VALUE) munit_assert_int(FsmGetX(&f->fsms[I]), ==, VALUE)
+
+/******************************************************************************
+ *
+ * raft_fixture_step
+ *
+ *****************************************************************************/
+
+SUITE(raft_fixture_step)
+
+/* If there is no disk I/O in progress or network messages in flight, the tick
+ * callbacks are called. */
+TEST(raft_fixture_step, tick, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_fixture_event *event;
+    (void)params;
+
+    ASSERT_TIME(0);
+
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(100);
+
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 1);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(100);
+
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 2);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(100);
+
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(200);
+
+    return MUNIT_OK;
+}
+
+/* By default the election timeout of server 0 is the first to expire. */
+TEST(raft_fixture_step, electionTimeout, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_fixture_event *event;
+    (void)params;
+    event = STEP_N(28);
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_TICK);
+    ASSERT_TIME(1000);
+    ASSERT_STATE(0, RAFT_CANDIDATE);
+    ASSERT_STATE(1, RAFT_FOLLOWER);
+    ASSERT_STATE(2, RAFT_FOLLOWER);
+    munit_log(MUNIT_LOG_INFO, "done");
+    return MUNIT_OK;
+}
+
+/* Send requests are flushed immediately. */
+TEST(raft_fixture_step, flushSend, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_fixture_event *event;
+    (void)params;
+    STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE);
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK);
+    ASSERT_TIME(1000);
+    event = STEP;
+    munit_assert_int(raft_fixture_event_server_index(event), ==, 0);
+    munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK);
+    ASSERT_TIME(1000);
+    return MUNIT_OK;
+}
+
+/* Messages are delivered according to the current network latency.
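+ *
+ * (The fixture's default latency appears to be 15 ms: a message sent at time
+ * 1000 below arrives at 1015. Other tests in this suite tune it per server
+ * via the CLUSTER_SET_NETWORK_LATENCY wrapper, e.g. slowing server 2 down to
+ * 100 ms:
+ *
+ *     CLUSTER_SET_NETWORK_LATENCY(2, 100);
+ * )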
*/ +TEST(raft_fixture_step, deliver, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_fixture_event *event; + (void)params; + STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE); /* Server 0 starts election */ + STEP_N(2); /* Server 0 sends 2 RequestVote */ + STEP_N(2); /* Ticks for server 1 and 2 */ + ASSERT_TIME(1000); + event = STEP; + munit_assert_int(raft_fixture_event_server_index(event), ==, 0); + munit_assert_int(raft_fixture_event_type(event), ==, RAFT_FIXTURE_NETWORK); + ASSERT_TIME(1015); + return MUNIT_OK; +} + +/****************************************************************************** + * + * raft_fixture_elect + * + *****************************************************************************/ + +SUITE(raft_fixture_elect) + +/* Trigger the election of the first server. */ +TEST(raft_fixture_elect, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ELECT(0); + ASSERT_STATE(0, RAFT_LEADER); + ASSERT_STATE(1, RAFT_FOLLOWER); + ASSERT_STATE(2, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/* Trigger the election of the second server. */ +TEST(raft_fixture_elect, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ELECT(1); + ASSERT_STATE(0, RAFT_FOLLOWER); + ASSERT_STATE(1, RAFT_LEADER); + ASSERT_STATE(2, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/* Trigger an election change. */ +TEST(raft_fixture_elect, change, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ELECT(0); + DEPOSE; + ASSERT_STATE(0, RAFT_FOLLOWER); + ASSERT_STATE(1, RAFT_FOLLOWER); + ASSERT_STATE(2, RAFT_FOLLOWER); + ELECT(1); + ASSERT_STATE(0, RAFT_FOLLOWER); + ASSERT_STATE(1, RAFT_LEADER); + ASSERT_STATE(2, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/* Trigger an election that re-elects the same node. */ +TEST(raft_fixture_elect, again, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ELECT(0); + DEPOSE; + ASSERT_STATE(0, RAFT_FOLLOWER); + ASSERT_STATE(1, RAFT_FOLLOWER); + ASSERT_STATE(2, RAFT_FOLLOWER); + ELECT(0); + ASSERT_STATE(0, RAFT_LEADER); + ASSERT_STATE(1, RAFT_FOLLOWER); + ASSERT_STATE(2, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/****************************************************************************** + * + * raft_fixture_step_until_applied + * + *****************************************************************************/ + +SUITE(raft_fixture_step_until_applied) + +/* Wait for one entry to be applied. */ +TEST(raft_fixture_step_until_applied, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply *req = munit_malloc(sizeof *req); + ELECT(0); + APPLY(0, req); + STEP_UNTIL_APPLIED(3); + ASSERT_FSM_X(0, 1); + ASSERT_FSM_X(1, 1); + ASSERT_FSM_X(2, 1); + free(req); + return MUNIT_OK; +} + +/* Wait for two entries to be applied. 
*/ +TEST(raft_fixture_step_until_applied, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply *req1 = munit_malloc(sizeof *req1); + struct raft_apply *req2 = munit_malloc(sizeof *req2); + ELECT(0); + APPLY(0, req1); + APPLY(0, req2); + STEP_UNTIL_APPLIED(4); + ASSERT_FSM_X(0, 2); + ASSERT_FSM_X(1, 2); + ASSERT_FSM_X(2, 2); + free(req1); + free(req2); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_heap.c b/test/raft/integration/test_heap.c new file mode 100644 index 000000000..a6265cbeb --- /dev/null +++ b/test/raft/integration/test_heap.c @@ -0,0 +1,53 @@ +#include "../../../src/raft.h" + +#include "../lib/runner.h" + +/****************************************************************************** + * + * Default heap functions + * + *****************************************************************************/ + +SUITE(raft_heap) + +TEST(raft_heap, malloc, NULL, NULL, 0, NULL) +{ + void *p; + p = raft_malloc(8); + munit_assert_ptr_not_null(p); + raft_free(p); + return MUNIT_OK; +} + +TEST(raft_heap, calloc, NULL, NULL, 0, NULL) +{ + void *p; + p = raft_calloc(1, 8); + munit_assert_ptr_not_null(p); + munit_assert_int(*(uint64_t *)p, ==, 0); + raft_free(p); + return MUNIT_OK; +} + +TEST(raft_heap, realloc, NULL, NULL, 0, NULL) +{ + void *p; + p = raft_realloc(NULL, 8); + munit_assert_ptr_not_null(p); + *(uint64_t *)p = 1; + p = raft_realloc(p, 16); + munit_assert_ptr_not_null(p); + munit_assert_int(*(uint64_t *)p, ==, 1); + raft_free(p); + return MUNIT_OK; +} + +TEST(raft_heap, aligned_alloc, NULL, NULL, 0, NULL) +{ + void *p; + p = raft_aligned_alloc(1024, 2048); + munit_assert_ptr_not_null(p); + munit_assert_int((uintptr_t)p % 1024, ==, 0); + raft_free(p); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_init.c b/test/raft/integration/test_init.c new file mode 100644 index 000000000..512864d2c --- /dev/null +++ b/test/raft/integration/test_init.c @@ -0,0 +1,85 @@ +#include "../../../src/raft.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * raft_init + * + *****************************************************************************/ + +SUITE(raft_init) + +/* Incompatible raft->io and raft->fsm wrt async snapshots. */ +TEST(raft_init, incompatIoFsmAsyncSnapshotNotNull, NULL, NULL, 0, NULL) +{ + /* Set incompatible io and fsm versions and non-NULL snapshot_async fn */ + struct raft r = {0}; + struct raft_io io = {0}; + struct raft_fsm fsm = {0}; + io.version = 1; /* Too low */ + io.async_work = (int (*)(struct raft_io *, struct raft_io_async_work *, + raft_io_async_work_cb))(uintptr_t)0xDEADBEEF; + fsm.version = 3; + fsm.snapshot_async = (int (*)(struct raft_fsm *, struct raft_buffer **, + unsigned int *))(uintptr_t)0xDEADBEEF; + + int rc; + rc = raft_init(&r, &io, &fsm, 1, "1"); + munit_assert_int(rc, ==, -1); + munit_assert_string_equal( + r.errmsg, + "async snapshot requires io->version > 1 and async_work method."); + return MUNIT_OK; +} + +/* Incompatible raft->io and raft->fsm wrt async snapshots. 
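+ *
+ * (Per the error message asserted below, the compatible combination pairs an
+ * io with version > 1 and a non-NULL async_work method with an fsm that sets
+ * snapshot_async. A sketch of the field assignments, with hypothetical
+ * implementation functions:
+ *
+ *     io.version = 2;
+ *     io.async_work = my_async_work;           // hypothetical
+ *     fsm.version = 3;
+ *     fsm.snapshot_async = my_snapshot_async;  // hypothetical
+ * )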
 */
+TEST(raft_init, incompatIoFsmAsyncSnapshotNull, NULL, NULL, 0, NULL)
+{
+    /* Set incompatible io and fsm versions and NULL snapshot_async fn */
+    struct raft r = {0};
+    struct raft_io io = {0};
+    struct raft_fsm fsm = {0};
+    io.version = 2;
+    io.async_work = NULL;
+    fsm.version = 3;
+    fsm.snapshot_async = (int (*)(struct raft_fsm *, struct raft_buffer **,
+                                  unsigned int *))(uintptr_t)0xDEADBEEF;
+
+    int rc;
+    rc = raft_init(&r, &io, &fsm, 1, "1");
+    munit_assert_int(rc, ==, -1);
+    munit_assert_string_equal(
+        r.errmsg,
+        "async snapshot requires io->version > 1 and async_work method.");
+    return MUNIT_OK;
+}
+
+TEST(raft_init, ioVersionNotSet, NULL, NULL, 0, NULL)
+{
+    struct raft r = {0};
+    struct raft_io io = {0};
+    struct raft_fsm fsm = {0};
+    io.version = 0;
+    fsm.version = 3;
+
+    int rc;
+    rc = raft_init(&r, &io, &fsm, 1, "1");
+    munit_assert_int(rc, ==, -1);
+    munit_assert_string_equal(r.errmsg, "io->version must be set");
+    return MUNIT_OK;
+}
+
+TEST(raft_init, fsmVersionNotSet, NULL, NULL, 0, NULL)
+{
+    struct raft r = {0};
+    struct raft_io io = {0};
+    struct raft_fsm fsm = {0};
+    io.version = 2;
+    fsm.version = 0;
+
+    int rc;
+    rc = raft_init(&r, &io, &fsm, 1, "1");
+    munit_assert_int(rc, ==, -1);
+    munit_assert_string_equal(r.errmsg, "fsm->version must be set");
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_membership.c b/test/raft/integration/test_membership.c
new file mode 100644
index 000000000..53d43aea9
--- /dev/null
+++ b/test/raft/integration/test_membership.c
@@ -0,0 +1,317 @@
+#include "../../../src/raft/configuration.h"
+#include "../../../src/raft/progress.h"
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_CLUSTER;
+    struct raft_change req;
+};
+
+/* Set up a cluster of 2 servers, with the first as leader. */
+static void *setup(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(2);
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+    return f;
+}
+
+static void tear_down(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+/* Add an empty server to the cluster and start it. */
+#define GROW                                        \
+    {                                               \
+        int rv__;                                   \
+        CLUSTER_GROW;                               \
+        rv__ = raft_start(CLUSTER_RAFT(2));         \
+        munit_assert_int(rv__, ==, 0);              \
+    }
+
+/* Invoke raft_add against the I'th node and assert it returns the given
+ * value. */
+#define ADD(I, ID, RV)                                                \
+    {                                                                 \
+        int rv_;                                                      \
+        char address_[16];                                            \
+        sprintf(address_, "%d", ID);                                  \
+        rv_ = raft_add(CLUSTER_RAFT(I), &f->req, ID, address_, NULL); \
+        munit_assert_int(rv_, ==, RV);                                \
+    }
+
+/* Submit a request to assign the given ROLE to the server with the given
+ * ID. */
+#define ASSIGN(I, ID, ROLE)                                          \
+    {                                                                \
+        int _rv;                                                     \
+        _rv = raft_assign(CLUSTER_RAFT(I), &f->req, ID, ROLE, NULL); \
+        munit_assert_int(_rv, ==, 0);                                \
+    }
+
+/* Invoke raft_remove against the I'th node and assert it returns the given
+ * value.
*/ +#define REMOVE(I, ID, RV) \ + { \ + int rv_; \ + rv_ = raft_remove(CLUSTER_RAFT(I), &f->req, ID, NULL); \ + munit_assert_int(rv_, ==, RV); \ + } + +struct result +{ + int status; + bool done; +}; + +/* Submit an apply request. */ +#define APPLY_SUBMIT(I) \ + struct raft_buffer _buf; \ + struct raft_apply _req; \ + struct result _result = {0, false}; \ + int _rv; \ + FsmEncodeSetX(123, &_buf); \ + _req.data = &_result; \ + _rv = raft_apply(CLUSTER_RAFT(I), &_req, &_buf, 1, NULL); \ + munit_assert_int(_rv, ==, 0); + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert the values of the committed and uncommitted configuration indexes on + * the raft instance with the given index. */ +#define ASSERT_CONFIGURATION_INDEXES(I, COMMITTED, UNCOMMITTED) \ + { \ + struct raft *raft_ = CLUSTER_RAFT(I); \ + munit_assert_int(raft_->configuration_committed_index, ==, COMMITTED); \ + munit_assert_int(raft_->configuration_uncommitted_index, ==, \ + UNCOMMITTED); \ + } + +/****************************************************************************** + * + * raft_add + * + *****************************************************************************/ + +SUITE(raft_add) + +/* After a request to add a new non-voting server is committed, the new + * configuration is not marked as uncommitted anymore */ +TEST(raft_add, committed, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + const struct raft_server *server; + ADD(0 /* I */, + 3 /* ID */, 0); + + /* The new configuration is already effective. */ + munit_assert_int(raft->configuration.n, ==, 3); + server = &raft->configuration.servers[2]; + munit_assert_int(server->id, ==, 3); + munit_assert_string_equal(server->address, "3"); + munit_assert_int(server->role, ==, RAFT_SPARE); + + /* The new configuration is marked as uncommitted. */ + ASSERT_CONFIGURATION_INDEXES(0, 1, 3); + + /* The next/match indexes now include an entry for the new server. */ + munit_assert_int(raft->leader_state.progress[2].next_index, ==, 4); + munit_assert_int(raft->leader_state.progress[2].match_index, ==, 0); + + CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000); + ASSERT_CONFIGURATION_INDEXES(0, 3, 0); + + /* The new configuration is marked as committed. */ + + return MUNIT_OK; +} + +/* Trying to add a server on a node which is not the leader results in an + * error. */ +TEST(raft_add, notLeader, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + ADD(1 /* I */, + 3 /* ID */, + RAFT_NOTLEADER); + return MUNIT_OK; +} + +/* Trying to add a server while a configuration change is already in progress + * results in an error. */ +TEST(raft_add, busy, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + ADD(0 /* I */, + 3 /* ID */, 0); + ADD(0 /* I */, + 4 /* ID */, + RAFT_CANTCHANGE); + munit_log(MUNIT_LOG_INFO, "done"); + return MUNIT_OK; +} + +/* Trying to add a server with an ID which is already in use results in an + * error. 
*/
+TEST(raft_add, duplicateId, setup, tear_down, 0, NULL)
+{
+    struct fixture *f = data;
+    ADD(0 /* I */,
+        2 /* ID */,
+        RAFT_DUPLICATEID);
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * raft_remove
+ *
+ *****************************************************************************/
+
+SUITE(raft_remove)
+
+/* After a request to remove a server is committed, the new configuration is
+ * not marked as uncommitted anymore */
+TEST(raft_remove, committed, setup, tear_down, 0, NULL)
+{
+    struct fixture *f = data;
+    GROW;
+    ADD(0, 3, 0);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000);
+    ASSIGN(0, 3, RAFT_STANDBY);
+    CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000);
+    CLUSTER_STEP_N(2);
+    REMOVE(0, 3, 0);
+    ASSERT_CONFIGURATION_INDEXES(0, 4, 5);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 5, 2000);
+    ASSERT_CONFIGURATION_INDEXES(0, 5, 0);
+    munit_assert_int(CLUSTER_RAFT(0)->configuration.n, ==, 2);
+    return MUNIT_OK;
+}
+
+/* A leader gets a request to remove itself. */
+TEST(raft_remove, self, setup, tear_down, 0, NULL)
+{
+    struct fixture *f = data;
+    REMOVE(0, 1, 0);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 2, 2000);
+    CLUSTER_STEP_UNTIL_APPLIED(1, 2, 10000);
+    return MUNIT_OK;
+}
+
+/* A leader gets a request to remove itself from a 3-node cluster */
+TEST(raft_remove, selfThreeNodeClusterReplicate, setup, tear_down, 0, NULL)
+{
+    struct fixture *f = data;
+    /* Add a third node */
+    GROW;
+    ADD(0, 3, 0);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 2000);
+    ASSIGN(0, 3, RAFT_VOTER);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 4, 2000);
+
+    /* Verify node with id 1 is the leader */
+    raft_id leader_id = 0xDEADBEEF;
+    const char *leader_address = NULL;
+    raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address);
+    munit_assert_ulong(leader_id, ==, 1);
+    munit_assert_ptr_not_null(leader_address);
+
+    /* The leader is requested to remove itself from the configuration */
+    REMOVE(0, 1, 0);
+
+    /* The removed leader should still replicate entries.
+     *
+     * Raft dissertation 4.2.2:
+     * `First, there will be a period of time (while it is committing Cnew)
+     * when a leader can manage a cluster that does not include itself; it
+     * replicates log entries but does not count itself in majorities.` */
+    APPLY_SUBMIT(0)
+
+    /* The removed leader eventually steps down */
+    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(5000);
+    raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address);
+    munit_assert_ulong(leader_id, ==, 0);
+    munit_assert_ptr_null(leader_address);
+
+    /* The original leader has applied the REMOVE entry */
+    CLUSTER_STEP_UNTIL_APPLIED(0, 5, 10000);
+
+    /* At this point the other nodes have replicated the new config, but have
+     * not yet applied it; they are missing a heartbeat from the leader
+     * informing them of the commit index of the new config. */
+
+    /* A new leader is elected */
+    CLUSTER_STEP_UNTIL_HAS_LEADER(5000);
+
+    /* The other nodes applied the barrier after
+     * the config change and therefore commit the new config.
*/ + CLUSTER_STEP_UNTIL_APPLIED(1, 6, 10000); + CLUSTER_STEP_UNTIL_APPLIED(2, 6, 10000); + + /* The removed leader doesn't know who the leader is */ + raft_leader(CLUSTER_RAFT(0), &leader_id, &leader_address); + munit_assert_ulong(leader_id, ==, 0); + munit_assert_ptr_null(leader_address); + + /* The new configuration has a leader */ + raft_leader(CLUSTER_RAFT(1), &leader_id, &leader_address); + munit_assert_ulong(leader_id, !=, 0); + munit_assert_ulong(leader_id, !=, 1); + munit_assert_ptr_not_null(leader_address); + return MUNIT_OK; +} + +/* Trying to remove a server on a node which is not the leader results in an + * error. */ +TEST(raft_remove, notLeader, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + REMOVE(1 /* I */, + 3 /* ID */, + RAFT_NOTLEADER); + return MUNIT_OK; +} + +/* Trying to remove a server while a configuration change is already in progress + * results in an error. */ +TEST(raft_remove, inProgress, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + ADD(0, 3, 0); + REMOVE(0, 3, RAFT_CANTCHANGE); + return MUNIT_OK; +} + +/* Trying to remove a server with an unknown ID results in an error. */ +TEST(raft_remove, badId, setup, tear_down, 0, NULL) +{ + struct fixture *f = data; + REMOVE(0, 3, RAFT_BADID); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_recover.c b/test/raft/integration/test_recover.c new file mode 100644 index 000000000..26f036857 --- /dev/null +++ b/test/raft/integration/test_recover.c @@ -0,0 +1,56 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture holding a bootstrapped raft cluster. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(3); + CLUSTER_BOOTSTRAP; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Recover tests. + * + *****************************************************************************/ + +SUITE(raft_recover) + +/* Attempting to recover a running instance results in RAFT_BUSY. */ +TEST(raft_recover, busy, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + struct raft_configuration configuration; + int rv; + + /* Start all servers. 
*/
+    CLUSTER_START;
+
+    raft = CLUSTER_RAFT(0);
+    CLUSTER_CONFIGURATION(&configuration);
+    rv = raft_recover(raft, &configuration);
+    munit_assert_int(rv, ==, RAFT_BUSY);
+    raft_configuration_close(&configuration);
+
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_replication.c b/test/raft/integration/test_replication.c
new file mode 100644
index 000000000..c971a78a7
--- /dev/null
+++ b/test/raft/integration/test_replication.c
@@ -0,0 +1,1280 @@
+#include "../../../src/raft/configuration.h"
+#include "../../../src/raft/flags.h"
+#include "../../../src/raft/progress.h"
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_CLUSTER;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+/* Standard startup sequence, bootstrapping the cluster and electing server 0 */
+#define BOOTSTRAP_START_AND_ELECT \
+    CLUSTER_BOOTSTRAP;            \
+    CLUSTER_START;                \
+    CLUSTER_ELECT(0);             \
+    ASSERT_TIME(1045)
+
+/******************************************************************************
+ *
+ * Set up a cluster with two servers.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(2);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Assertions
+ *
+ *****************************************************************************/
+
+/* Assert that the I'th server is in follower state. */
+#define ASSERT_FOLLOWER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_FOLLOWER)
+
+/* Assert that the I'th server is in candidate state. */
+#define ASSERT_CANDIDATE(I) \
+    munit_assert_int(CLUSTER_STATE(I), ==, RAFT_CANDIDATE)
+
+/* Assert that the I'th server is in leader state. */
+#define ASSERT_LEADER(I) munit_assert_int(CLUSTER_STATE(I), ==, RAFT_LEADER)
+
+/* Assert that the fixture time matches the given value */
+#define ASSERT_TIME(TIME) munit_assert_int(CLUSTER_TIME, ==, TIME)
+
+/* Assert that the configuration of the I'th server matches the given one */
+#define ASSERT_CONFIGURATION(I, EXPECTED)                                    \
+    do {                                                                     \
+        struct raft *_raft = CLUSTER_RAFT(I);                                \
+        struct raft_configuration *_actual = &_raft->configuration;          \
+        unsigned _i;                                                         \
+                                                                             \
+        munit_assert_uint(_actual->n, ==, (EXPECTED)->n);                    \
+        for (_i = 0; _i < _actual->n; _i++) {                                \
+            struct raft_server *_server1 = &_actual->servers[_i];            \
+            struct raft_server *_server2 = &(EXPECTED)->servers[_i];         \
+            munit_assert_ulong(_server1->id, ==, _server2->id);              \
+            munit_assert_int(_server1->role, ==, _server2->role);            \
+            munit_assert_string_equal(_server1->address, _server2->address); \
+        }                                                                    \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Log replication.
+ *
+ *****************************************************************************/
+
+SUITE(replication)
+
+/* A leader sends a heartbeat message as soon as it gets elected.
*/ +TEST(replication, sendInitialHeartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes candidate and sends vote requests after the election + * timeout. */ + CLUSTER_STEP_N(19); + ASSERT_TIME(1000); + ASSERT_CANDIDATE(0); + + /* Server 0 receives the vote result, becomes leader and sends + * heartbeats. */ + CLUSTER_STEP_N(6); + ASSERT_LEADER(0); + ASSERT_TIME(1030); + raft = CLUSTER_RAFT(0); + munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1030); + + /* Server 1 receives the heartbeat from server 0 and resets its election + * timer. */ + raft = CLUSTER_RAFT(1); + munit_assert_int(raft->election_timer_start, ==, 1015); + CLUSTER_STEP_N(2); + munit_assert_int(raft->election_timer_start, ==, 1045); + + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); + + return MUNIT_OK; +} + +/* After receiving an AppendEntriesResult, a leader has set the feature flags of + * a node. */ +TEST(replication, receiveFlags, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes leader and sends the initial heartbeat. */ + CLUSTER_STEP_N(24); + ASSERT_LEADER(0); + ASSERT_TIME(1030); + + /* Flags is empty */ + raft = CLUSTER_RAFT(0); + munit_assert_ullong(raft->leader_state.progress[1].features, ==, 0); + + raft = CLUSTER_RAFT(1); + /* Server 1 receives the first heartbeat. */ + CLUSTER_STEP_N(4); + munit_assert_int(raft->election_timer_start, ==, 1045); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* Server 0 receives the reply to the heartbeat. */ + CLUSTER_STEP_N(2); + munit_assert_int(CLUSTER_N_RECV(0, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 1); + raft = CLUSTER_RAFT(0); + munit_assert_ullong(raft->leader_state.progress[1].features, ==, + RAFT_DEFAULT_FEATURE_FLAGS); + + return MUNIT_OK; +} + +/* A leader keeps sending heartbeat messages at regular intervals to + * maintain leadership. */ +TEST(replication, sendFollowupHeartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes leader and sends the initial heartbeat. */ + CLUSTER_STEP_N(24); + ASSERT_LEADER(0); + ASSERT_TIME(1030); + + raft = CLUSTER_RAFT(1); + + /* Server 1 receives the first heartbeat. */ + CLUSTER_STEP_N(4); + munit_assert_int(raft->election_timer_start, ==, 1045); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* Server 1 receives the second heartbeat. */ + CLUSTER_STEP_N(8); + munit_assert_int(raft->election_timer_start, ==, 1215); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 2); + + /* Server 1 receives the third heartbeat. */ + CLUSTER_STEP_N(7); + munit_assert_int(raft->election_timer_start, ==, 1315); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 3); + + /* Server 1 receives the fourth heartbeat. 
*/ + CLUSTER_STEP_N(7); + munit_assert_int(raft->election_timer_start, ==, 1415); + + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4); + munit_assert_int(CLUSTER_N_RECV(0, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 4); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 4); + munit_assert_int(CLUSTER_N_SEND(1, RAFT_IO_APPEND_ENTRIES_RESULT), ==, 4); + + return MUNIT_OK; +} + +/* If a leader replicates some entries during a given heartbeat interval, it + * skips sending the heartbeat for that interval. */ +TEST(replication, sendSkipHeartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + struct raft_apply req; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + raft = CLUSTER_RAFT(0); + + /* Server 0 becomes leader and sends the first two heartbeats. */ + CLUSTER_STEP_UNTIL_ELAPSED(1215); + ASSERT_LEADER(0); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 2); + + /* Server 0 starts replicating a new entry after 15 milliseconds. */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + ASSERT_TIME(1230); + CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); + CLUSTER_STEP_N(1); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); + munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1230); + + /* When the heartbeat timeout expires, server 0 does not send an empty + * append entries. */ + CLUSTER_STEP_UNTIL_ELAPSED(70); + ASSERT_TIME(1300); + CLUSTER_STEP_N(1); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); + munit_assert_int(raft->leader_state.progress[1].last_send, ==, 1230); + + return MUNIT_OK; +} + +/* The leader doesn't send replication messages to idle servers. */ +TEST(replication, skipIdle, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_change req1; + struct raft_apply req2; + BOOTSTRAP_START_AND_ELECT; + CLUSTER_ADD(&req1); + CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000); + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req2, 1, NULL); + CLUSTER_STEP_UNTIL_ELAPSED(1000); + munit_assert_int(CLUSTER_LAST_APPLIED(0), ==, 4); + munit_assert_int(CLUSTER_LAST_APPLIED(1), ==, 4); + munit_assert_int(CLUSTER_LAST_APPLIED(2), ==, 0); + return MUNIT_OK; +} + +/* A follower remains in probe mode until the leader receives a successful + * AppendEntries response. */ +TEST(replication, sendProbe, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply req1; + struct raft_apply req2; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes leader and sends the initial heartbeat. */ + CLUSTER_STEP_N(25); + ASSERT_LEADER(0); + ASSERT_TIME(1030); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* Set a very high network latency for server 1, so server 0 will send a + * second probe AppendEntries without transitioning to pipeline mode. */ + munit_assert_int(CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES), ==, 0); + CLUSTER_SET_NETWORK_LATENCY(1, 250); + + /* Server 0 receives a new entry after 15 milliseconds. Since the follower + * is still in probe mode and since an AppendEntries message was already + * sent recently, it does not send the new entry immediately. */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* A heartbeat timeout elapses without receiving a response, so server 0 + * sends an new AppendEntries to server 1. 
*/ + CLUSTER_STEP_UNTIL_ELAPSED(85); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); + + /* Server 0 receives a second entry after 15 milliseconds. Since the + * follower is still in probe mode and since an AppendEntries message was + * already sent recently, it does not send the new entry immediately. */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); + + /* Eventually server 0 receives AppendEntries results for both entries. */ + CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); + + return MUNIT_OK; +} + +static bool indices_updated(struct raft_fixture *f, void *data) +{ + (void)f; + const struct raft *r = data; + return r->last_stored == 4 && r->leader_state.progress[1].match_index == 3; +} + +/* A follower transitions to pipeline mode after the leader receives a + * successful AppendEntries response from it. */ +TEST(replication, sendPipeline, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft; + struct raft_apply req1; + struct raft_apply req2; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + raft = CLUSTER_RAFT(0); + + /* Server 0 becomes leader and sends the initial heartbeat, receiving a + * successful response. */ + CLUSTER_STEP_UNTIL_ELAPSED(1070); + ASSERT_LEADER(0); + ASSERT_TIME(1070); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1); + + /* Server 0 receives a new entry after 15 milliseconds. Since the follower + * has transitioned to pipeline mode the new entry is sent immediately and + * the next index is optimistically increased. */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2); + munit_assert_int(raft->leader_state.progress[1].next_index, ==, 4); + + /* After another 15 milliseconds server 0 receives a second apply request, + * which is also sent out immediately */ + CLUSTER_STEP_UNTIL_ELAPSED(15); + CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3); + munit_assert_int(raft->leader_state.progress[1].next_index, ==, 5); + + /* Wait until the leader has stored entry 4 and the follower has matched + * entry 3. Expect the commit index to have been updated to 3. */ + CLUSTER_STEP_UNTIL(indices_updated, CLUSTER_RAFT(0), 2000); + munit_assert_ulong(raft->commit_index, ==, 3); + + /* Eventually server 0 receives AppendEntries results for both entries. */ + CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); + + return MUNIT_OK; +} + +/* A follower disconnects while in probe mode. */ +TEST(replication, sendDisconnect, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + + /* Server 0 becomes leader and sends the initial heartbeat, however they + * fail because server 1 has disconnected. */ + CLUSTER_STEP_N(24); + ASSERT_LEADER(0); + CLUSTER_DISCONNECT(0, 1); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 0); + + /* After the heartbeat timeout server 0 retries, but still fails. */ + CLUSTER_STEP_UNTIL_ELAPSED(100); + CLUSTER_STEP; + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 0); + + /* After another heartbeat timeout server 0 retries and this time + * succeeds. 
*/
+    CLUSTER_STEP_UNTIL_ELAPSED(100);
+    CLUSTER_RECONNECT(0, 1);
+    CLUSTER_STEP;
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 1);
+
+    return MUNIT_OK;
+}
+
+/* A follower disconnects while in pipeline mode. */
+TEST(replication, sendDisconnectPipeline, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply req1;
+    struct raft_apply req2;
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+
+    /* Server 0 becomes leader and sends a couple of heartbeats. */
+    CLUSTER_STEP_UNTIL_ELAPSED(1215);
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 2);
+
+    /* It then starts to replicate a few entries, but the follower disconnects
+     * before delivering results. */
+    CLUSTER_APPLY_ADD_X(0, &req1, 1, NULL);
+    CLUSTER_STEP;
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 3);
+    CLUSTER_APPLY_ADD_X(0, &req2, 1, NULL);
+    CLUSTER_STEP;
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4);
+
+    CLUSTER_DISCONNECT(0, 1);
+
+    /* The next heartbeat fails, transitioning the follower back to probe
+     * mode. */
+    CLUSTER_STEP_UNTIL_ELAPSED(115);
+    munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_APPEND_ENTRIES), ==, 4);
+
+    /* After reconnection the follower eventually replicates the entries and
+     * reports back. */
+    CLUSTER_RECONNECT(0, 1);
+
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000);
+
+    return MUNIT_OK;
+}
+
+static char *send_oom_heap_fault_delay[] = {"5", NULL};
+static char *send_oom_heap_fault_repeat[] = {"1", NULL};
+
+static MunitParameterEnum send_oom_params[] = {
+    {TEST_HEAP_FAULT_DELAY, send_oom_heap_fault_delay},
+    {TEST_HEAP_FAULT_REPEAT, send_oom_heap_fault_repeat},
+    {NULL, NULL},
+};
+
+/* Out of memory failures. */
+TEST(replication, sendOom, setUp, tearDown, 0, send_oom_params)
+{
+    struct fixture *f = data;
+    return MUNIT_SKIP;
+    struct raft_apply req;
+    BOOTSTRAP_START_AND_ELECT;
+
+    HEAP_FAULT_ENABLE;
+
+    CLUSTER_APPLY_ADD_X(0, &req, 1, NULL);
+    CLUSTER_STEP;
+
+    return MUNIT_OK;
+}
+
+/* A failure occurs upon submitting the I/O request. */
+TEST(replication, persistError, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply req;
+    BOOTSTRAP_START_AND_ELECT;
+
+    raft_fixture_append_fault(&f->cluster, 0, 0);
+
+    CLUSTER_APPLY_ADD_X(0, &req, 1, NULL);
+    CLUSTER_STEP;
+
+    return MUNIT_OK;
+}
+
+/* Receive the same entry a second time, before the first has been persisted. */
+TEST(replication, recvTwice, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply *req = munit_malloc(sizeof *req);
+    BOOTSTRAP_START_AND_ELECT;
+
+    CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req, 1, NULL);
+
+    /* Set a high disk latency for server 1, so server 0 won't receive an
+     * AppendEntries result within the heartbeat and will re-send the same
+     * entries */
+    CLUSTER_SET_DISK_LATENCY(1, 300);
+
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); /* First AppendEntries */
+    CLUSTER_STEP_UNTIL_ELAPSED(110);         /* Heartbeat timeout */
+    CLUSTER_STEP_UNTIL_DELIVERED(0, 1, 100); /* Second AppendEntries */
+
+    CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 500);
+
+    free(req);
+
+    return MUNIT_OK;
+}
+
+/* If the term in the request is stale, the server rejects it. */
+TEST(replication, recvStaleTerm, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+    BOOTSTRAP_START_AND_ELECT;
+
+    /* Set a very high election timeout and then disconnect the leader so it
+     * will keep sending heartbeats.
*/
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 5000);
+    raft_set_election_timeout(CLUSTER_RAFT(0), 5000);
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+    CLUSTER_SATURATE_BOTHWAYS(0, 2);
+
+    /* Eventually a new leader gets elected. */
+    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(5000);
+    CLUSTER_STEP_UNTIL_HAS_LEADER(10000);
+    munit_assert_int(CLUSTER_LEADER, ==, 1);
+
+    /* Reconnect the old leader to the current follower. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 2);
+
+    /* Step a few times, so the old leader sends heartbeats to the follower,
+     * which rejects them. */
+    CLUSTER_STEP_UNTIL_ELAPSED(200);
+
+    return MUNIT_OK;
+}
+
+/* If the server's log is shorter than prevLogIndex, the request is rejected. */
+TEST(replication, recvMissingEntries, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    CLUSTER_BOOTSTRAP;
+
+    /* Server 0 has an entry that server 1 doesn't have */
+    entry.type = RAFT_COMMAND;
+    entry.term = 1;
+    FsmEncodeSetX(1, &entry.buf);
+    CLUSTER_ADD_ENTRY(0, &entry);
+
+    /* Server 0 wins the election because it has a longer log. */
+    CLUSTER_START;
+    CLUSTER_STEP_UNTIL_HAS_LEADER(5000);
+    munit_assert_int(CLUSTER_LEADER, ==, 0);
+
+    /* The first server replicates missing entries to the second. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000);
+
+    return MUNIT_OK;
+}
+
+/* If the term of the last log entry on the server is different from
+ * prevLogTerm, and the value of prevLogIndex is greater than the server's
+ * commit index (i.e. this is a normal inconsistency), we reject the request. */
+TEST(replication, recvPrevLogTermMismatch, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    CLUSTER_BOOTSTRAP;
+
+    /* The servers have an entry with a conflicting term. */
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 2;
+    FsmEncodeSetX(1, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_COMMAND;
+    entry2.term = 1;
+    FsmEncodeSetX(2, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+
+    /* The follower eventually replicates the entry */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 2, 3000);
+
+    return MUNIT_OK;
+}
+
+/* The follower has an uncommitted log entry that conflicts with a new one sent
+ * by the leader (same index but different term). The follower's conflicting log
+ * entry happens to be a configuration change. In that case the follower
+ * discards the conflicting entry from its log and rolls back its configuration
+ * to the initial one contained in the log entry at index 1. */
+TEST(replication, recvRollbackConfigurationToInitial, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    struct raft_configuration base; /* Committed configuration at index 1 */
+    struct raft_configuration conf; /* Uncommitted configuration at index 2 */
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_CONFIGURATION(&base);
+
+    /* Both servers have an entry at index 2, but with conflicting terms. The
+     * entry of the second server is a configuration change. */
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 2;
+    FsmEncodeSetX(1, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_CHANGE;
+    entry2.term = 1;
+    CLUSTER_CONFIGURATION(&conf);
+    raft_configuration_add(&conf, 3, "3", 2);
+    raft_configuration_encode(&conf, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    /* At startup the second server uses the most recent configuration, i.e.
+     * the one contained in the entry that we just added.
+     * The server can't know yet if it's committed or not, and regards it as
+     * a pending configuration change. */
+    CLUSTER_START;
+    ASSERT_CONFIGURATION(1, &conf);
+
+    /* The first server gets elected. */
+    CLUSTER_ELECT(0);
+
+    /* The second server eventually replicates the first server's log entry at
+     * index 2, truncating its own log and rolling back to the configuration
+     * contained in the log entry at index 1. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 2, 3000);
+    ASSERT_CONFIGURATION(0, &base);
+    ASSERT_CONFIGURATION(1, &base);
+
+    raft_configuration_close(&base);
+    raft_configuration_close(&conf);
+
+    return MUNIT_OK;
+}
+
+/* The follower has an uncommitted log entry that conflicts with a new one sent
+ * by the leader (same index but different term). The follower's conflicting log
+ * entry happens to be a configuration change. There's also an older committed
+ * configuration entry present. In that case the follower discards the
+ * conflicting entry from its log and rolls back its configuration to the
+ * committed one in the older configuration entry. */
+TEST(replication, recvRollbackConfigurationToPrevious, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    struct raft_entry entry3;
+    struct raft_entry entry4;
+    struct raft_configuration base; /* Committed configuration at index 2 */
+    struct raft_configuration conf; /* Uncommitted configuration at index 3 */
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_CONFIGURATION(&base);
+
+    /* Both servers have a matching configuration entry at index 2. */
+    CLUSTER_CONFIGURATION(&conf);
+
+    entry1.type = RAFT_CHANGE;
+    entry1.term = 1;
+    raft_configuration_encode(&conf, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_CHANGE;
+    entry2.term = 1;
+    raft_configuration_encode(&conf, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    /* Both servers have an entry at index 3, but with conflicting terms. The
+     * entry of the second server is a configuration change. */
+    entry3.type = RAFT_COMMAND;
+    entry3.term = 2;
+    FsmEncodeSetX(1, &entry3.buf);
+    CLUSTER_ADD_ENTRY(0, &entry3);
+
+    entry4.type = RAFT_CHANGE;
+    entry4.term = 1;
+    raft_configuration_add(&conf, 3, "3", 2);
+    raft_configuration_encode(&conf, &entry4.buf);
+    CLUSTER_ADD_ENTRY(1, &entry4);
+
+    /* At startup the second server uses the most recent configuration, i.e.
+     * the one contained in the log entry at index 3. The server can't know
+     * yet if it's committed or not, and regards it as a pending configuration
+     * change. */
+    CLUSTER_START;
+    ASSERT_CONFIGURATION(1, &conf);
+
+    /* The first server gets elected. */
+    CLUSTER_ELECT(0);
+
+    /* The second server eventually replicates the first server's log entry at
+     * index 3, truncating its own log and rolling back to the configuration
+     * contained in the log entry at index 2. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000);
+    ASSERT_CONFIGURATION(0, &base);
+    ASSERT_CONFIGURATION(1, &base);
+
+    raft_configuration_close(&base);
+    raft_configuration_close(&conf);
+
+    return MUNIT_OK;
+}
+
+/* The follower has an uncommitted log entry that conflicts with a new one sent
+ * by the leader (same index but different term). The follower's conflicting log
+ * entry happens to be a configuration change. The follower's log has been
+ * truncated after a snapshot and does not contain the previous committed
+ * configuration anymore.
+ * In that case the follower discards the conflicting entry from its log and
+ * rolls back its configuration to the previous committed one, which was
+ * cached when the snapshot was restored. */
+TEST(replication, recvRollbackConfigurationToSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    struct raft_configuration base; /* Committed configuration at index 1 */
+    struct raft_configuration conf; /* Uncommitted configuration at index 2 */
+    int rv;
+
+    CLUSTER_CONFIGURATION(&conf);
+    CLUSTER_CONFIGURATION(&base);
+
+    /* Bootstrap the first server. This creates a log entry at index 1
+     * containing the initial configuration. */
+    rv = raft_bootstrap(CLUSTER_RAFT(0), &conf);
+    munit_assert_int(rv, ==, 0);
+
+    /* The second server has a snapshot up to entry 1. Entry 1 is not present
+     * in the log. */
+    CLUSTER_SET_SNAPSHOT(1 /*            */,
+                         1 /* last index */,
+                         1 /* last term  */,
+                         1 /* conf index */,
+                         5 /* x          */,
+                         0 /* y          */);
+    CLUSTER_SET_TERM(1, 1);
+
+    /* Both servers have an entry at index 2, but with conflicting terms. The
+     * entry of the second server is a configuration change and gets appended
+     * to the truncated log. */
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 3;
+    FsmEncodeSetX(1, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_CHANGE;
+    entry2.term = 2;
+    raft_configuration_add(&conf, 3, "3", 2);
+    raft_configuration_encode(&conf, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    /* At startup the second server uses the most recent configuration, i.e.
+     * the one contained in the log entry at index 2. The server can't know
+     * yet if it's committed or not, and regards it as a pending configuration
+     * change. */
+    CLUSTER_START;
+    ASSERT_CONFIGURATION(1, &conf);
+
+    CLUSTER_ELECT(0);
+
+    /* The second server eventually replicates the first server's log entry at
+     * index 3, truncating its own log and rolling back to the configuration
+     * contained in the snapshot, which is not present in the log anymore but
+     * was cached at startup. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000);
+    ASSERT_CONFIGURATION(0, &base);
+    ASSERT_CONFIGURATION(1, &base);
+
+    raft_configuration_close(&base);
+    raft_configuration_close(&conf);
+
+    return MUNIT_OK;
+}
+
+/* If any of the new entries has the same index as an existing entry in our
+ * log, but a different term, and that entry index is already committed, we
+ * bail out with an error. */
+TEST(replication, recvPrevIndexConflict, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry1;
+    struct raft_entry entry2;
+    CLUSTER_BOOTSTRAP;
+
+    /* The servers have an entry with a conflicting term. */
+    entry1.type = RAFT_COMMAND;
+    entry1.term = 2;
+    FsmEncodeSetX(1, &entry1.buf);
+    CLUSTER_ADD_ENTRY(0, &entry1);
+
+    entry2.type = RAFT_COMMAND;
+    entry2.term = 1;
+    FsmEncodeSetX(2, &entry2.buf);
+    CLUSTER_ADD_ENTRY(1, &entry2);
+
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+
+    /* Artificially bump the commit index on the second server */
+    CLUSTER_RAFT(1)->commit_index = 2;
+    CLUSTER_STEP;
+    CLUSTER_STEP;
+
+    return MUNIT_OK;
+}
+
+/* A write log request is submitted for outstanding log entries. If some
+ * entries already exist in the log, they will be skipped.
*/
+TEST(replication, recvSkip, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply *req = munit_malloc(sizeof *req);
+    BOOTSTRAP_START_AND_ELECT;
+
+    /* Submit an entry */
+    CLUSTER_APPLY_ADD_X(0, req, 1, NULL);
+
+    /* The leader replicates the entry to the follower, but it does not get
+     * notified about the result, so it sends the entry again. */
+    CLUSTER_STEP;
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+    CLUSTER_STEP_UNTIL_ELAPSED(150);
+
+    /* The follower reconnects and receives the same entry again. This time the
+     * leader receives the notification. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 1);
+    CLUSTER_STEP_UNTIL_APPLIED(0, req->index, 2000);
+
+    free(req);
+
+    return MUNIT_OK;
+}
+
+/* If the index and term of the last snapshot on the server match prevLogIndex
+ * and prevLogTerm the request is accepted. */
+TEST(replication, recvMatch_last_snapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    struct raft_configuration configuration;
+    int rv;
+
+    CLUSTER_CONFIGURATION(&configuration);
+    rv = raft_bootstrap(CLUSTER_RAFT(0), &configuration);
+    munit_assert_int(rv, ==, 0);
+    raft_configuration_close(&configuration);
+
+    /* The first server has entry 2 */
+    entry.type = RAFT_COMMAND;
+    entry.term = 2;
+    FsmEncodeSetX(5, &entry.buf);
+    CLUSTER_ADD_ENTRY(0, &entry);
+
+    /* The second server has a snapshot up to entry 2 */
+    CLUSTER_SET_SNAPSHOT(1 /*            */,
+                         2 /* last index */,
+                         2 /* last term  */,
+                         1 /* conf index */,
+                         5 /* x          */,
+                         0 /* y          */);
+    CLUSTER_SET_TERM(1, 2);
+
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+
+    /* Apply an additional entry and check that it gets replicated on the
+     * follower. */
+    CLUSTER_MAKE_PROGRESS;
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 3000);
+
+    return MUNIT_OK;
+}
+
+/* If a candidate server receives a request containing the same term as its
+ * own, it steps down to follower and accepts the request. */
+TEST(replication, recvCandidateSameTerm, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+    CLUSTER_BOOTSTRAP;
+
+    /* Disconnect server 2 from the other two and set a low election timeout on
+     * it, so it will immediately start an election. */
+    CLUSTER_SATURATE_BOTHWAYS(2, 0);
+    CLUSTER_SATURATE_BOTHWAYS(2, 1);
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 2, 800);
+    raft_set_election_timeout(CLUSTER_RAFT(2), 800);
+
+    /* Server 2 becomes candidate. */
+    CLUSTER_START;
+    CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_CANDIDATE, 1000);
+    munit_assert_int(CLUSTER_TERM(2), ==, 2);
+
+    /* Server 0 wins the election and replicates an entry. */
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 2000);
+    munit_assert_int(CLUSTER_TERM(0), ==, 2);
+    munit_assert_int(CLUSTER_TERM(1), ==, 2);
+    munit_assert_int(CLUSTER_TERM(2), ==, 2);
+    CLUSTER_MAKE_PROGRESS;
+
+    /* Now reconnect the third server, which eventually steps down and
+     * replicates the entry. */
+    munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE);
+    munit_assert_int(CLUSTER_TERM(2), ==, 2);
+    CLUSTER_DESATURATE_BOTHWAYS(2, 0);
+    CLUSTER_DESATURATE_BOTHWAYS(2, 1);
+    CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_FOLLOWER, 2000);
+    CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000);
+
+    return MUNIT_OK;
+}
+
+/* If a candidate server receives a request containing a higher term than its
+ * own, it steps down to follower and accepts the request.
*/ +TEST(replication, recvCandidateHigherTerm, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + CLUSTER_BOOTSTRAP; + + /* Set a high election timeout on server 1, so it won't become candidate */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 2000); + raft_set_election_timeout(CLUSTER_RAFT(1), 2000); + + /* Disconnect server 2 from the other two. */ + CLUSTER_SATURATE_BOTHWAYS(2, 0); + CLUSTER_SATURATE_BOTHWAYS(2, 1); + + /* Set a low election timeout on server 0, and disconnect it from server 1, + * so by the time it wins the second round, server 2 will have turned + * candidate */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 800); + raft_set_election_timeout(CLUSTER_RAFT(0), 800); + CLUSTER_SATURATE_BOTHWAYS(0, 1); + + CLUSTER_START; + + /* Server 2 becomes candidate, and server 0 already is candidate. */ + CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_CANDIDATE, 1500); + munit_assert_int(CLUSTER_TERM(2), ==, 2); + munit_assert_int(CLUSTER_STATE(0), ==, RAFT_CANDIDATE); + munit_assert_int(CLUSTER_TERM(0), ==, 2); + + /* Server 0 starts a new election, while server 2 is still candidate */ + CLUSTER_STEP_UNTIL_TERM_IS(0, 3, 2000); + munit_assert_int(CLUSTER_TERM(2), ==, 2); + munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE); + + /* Reconnect the first and second server and let the election succeed and + * replicate an entry. */ + CLUSTER_DESATURATE_BOTHWAYS(0, 1); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + CLUSTER_MAKE_PROGRESS; + + /* Now reconnect the third server, which eventually steps down and + * replicates the entry. */ + munit_assert_int(CLUSTER_STATE(2), ==, RAFT_CANDIDATE); + munit_assert_int(CLUSTER_TERM(2), ==, 2); + CLUSTER_DESATURATE_BOTHWAYS(2, 0); + CLUSTER_DESATURATE_BOTHWAYS(2, 1); + CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_FOLLOWER, 2000); + CLUSTER_STEP_UNTIL_APPLIED(2, 2, 2000); + + return MUNIT_OK; +} + +/* If the server handling the response is not the leader, the result + * is ignored. */ +TEST(replication, resultNotLeader, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + BOOTSTRAP_START_AND_ELECT; + + /* Set a very high-latency for the second server's outgoing messages, so the + * first server won't get notified about the results for a while. */ + CLUSTER_SET_NETWORK_LATENCY(1, 400); + + /* Set a low election timeout on the first server so it will step down very + * soon. */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 200); + raft_set_election_timeout(CLUSTER_RAFT(0), 200); + + /* Eventually leader steps down and becomes candidate. */ + CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_CANDIDATE, 2000); + + /* The AppendEntries result eventually gets delivered, but the candidate + * ignores it. */ + CLUSTER_STEP_UNTIL_ELAPSED(400); + + return MUNIT_OK; +} + +/* If the response has a term which is lower than the server's one, it's + * ignored. */ +TEST(replication, resultLowerTerm, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + BOOTSTRAP_START_AND_ELECT; + + /* Set a very high-latency for the second server's outgoing messages, so the + * first server won't get notified about the results for a while. */ + CLUSTER_SET_NETWORK_LATENCY(1, 2000); + + /* Set a high election timeout on server 1, so it won't become candidate */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 1, 2000); + raft_set_election_timeout(CLUSTER_RAFT(1), 2000); + + /* Disconnect server 0 and set a low election timeout on it so it will step + * down very soon. 
*/
+    CLUSTER_SATURATE_BOTHWAYS(0, 2);
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 200);
+    raft_set_election_timeout(CLUSTER_RAFT(0), 200);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000);
+
+    /* Make server 0 become leader again. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 2);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_LEADER, 4000);
+
+    /* Eventually deliver the result message. */
+    CLUSTER_STEP_UNTIL_ELAPSED(2500);
+
+    return MUNIT_OK;
+}
+
+/* If the response has a term which is higher than the server's one, step down
+ * to follower. */
+TEST(replication, resultHigherTerm, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+    BOOTSTRAP_START_AND_ELECT;
+
+    /* Set a very high election timeout for server 0 so it won't step down. */
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 5000);
+    raft_set_election_timeout(CLUSTER_RAFT(0), 5000);
+
+    /* Disconnect server 0 from the rest of the cluster. */
+    CLUSTER_SATURATE_BOTHWAYS(0, 1);
+    CLUSTER_SATURATE_BOTHWAYS(0, 2);
+
+    /* Eventually a new leader gets elected */
+    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(2000);
+    CLUSTER_STEP_UNTIL_HAS_LEADER(4000);
+    munit_assert_int(CLUSTER_LEADER, ==, 1);
+
+    /* Reconnect the old leader to the current follower, which eventually
+     * replies with an AppendEntries result containing a higher term. */
+    CLUSTER_DESATURATE_BOTHWAYS(0, 2);
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000);
+
+    return MUNIT_OK;
+}
+
+/* If the response fails because of a log mismatch, the nextIndex for the
+ * server is updated and the relevant older entries are resent. */
+TEST(replication, resultRetry, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry entry;
+    CLUSTER_BOOTSTRAP;
+
+    /* Add an additional entry to the first server that the second server does
+     * not have. */
+    entry.type = RAFT_COMMAND;
+    entry.term = 1;
+    FsmEncodeSetX(5, &entry.buf);
+    CLUSTER_ADD_ENTRY(0, &entry);
+
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+
+    /* The first server receives an AppendEntries result from the second server
+     * indicating that its log does not have the entry at index 2, so it will
+     * resend it. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 2000);
+
+    return MUNIT_OK;
+}
+
+static void applyAssertStatusCb(struct raft_apply *req,
+                                int status,
+                                void *result)
+{
+    (void)result;
+    int status_expected = (int)(intptr_t)(req->data);
+    munit_assert_int(status_expected, ==, status);
+}
+
+/* When the leader fails to write some new entries to disk, it steps down. */
+TEST(replication, diskWriteFailure, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply *req = munit_malloc(sizeof(*req));
+    req->data = (void *)(intptr_t)RAFT_IOERR;
+    BOOTSTRAP_START_AND_ELECT;
+
+    raft_fixture_append_fault(&f->cluster, 0, 0);
+    CLUSTER_APPLY_ADD_X(0, req, 1, applyAssertStatusCb);
+    /* The leader steps down when its disk write fails. */
+    CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000);
+    free(req);
+
+    return MUNIT_OK;
+}
+
+/* A follower updates its term number while persisting entries.
*/
+TEST(replication, newTermWhileAppending, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_apply *req = munit_malloc(sizeof(*req));
+    raft_term term;
+    CLUSTER_GROW;
+
+    /* Make sure that persisting entries will take a long time */
+    CLUSTER_SET_DISK_LATENCY(2, 3000);
+
+    BOOTSTRAP_START_AND_ELECT;
+    CLUSTER_APPLY_ADD_X(0, req, 1, NULL);
+
+    /* Wait for the leader to replicate the entry */
+    CLUSTER_STEP_UNTIL_ELAPSED(500);
+
+    /* Force a new term */
+    term = CLUSTER_RAFT(2)->current_term;
+    CLUSTER_DEPOSE;
+    CLUSTER_ELECT(1);
+
+    CLUSTER_STEP_UNTIL_ELAPSED(500);
+    munit_assert_ullong(CLUSTER_RAFT(2)->current_term, ==, term + 1);
+
+    /* Wait for the long disk write to complete */
+    CLUSTER_STEP_UNTIL_ELAPSED(3000);
+
+    free(req);
+
+    return MUNIT_OK;
+}
+
+/* A leader with a slow disk commits an entry that it hasn't persisted yet,
+ * because enough followers to form a majority have acknowledged that they
+ * have appended the entry. The leader's last_stored field hence lags behind
+ * its commit_index. A new leader gets elected, with a higher commit index,
+ * and sends first a new entry and then a heartbeat to the old leader, which
+ * needs to update its commit_index taking into account its lagging
+ * last_stored. */
+TEST(replication, lastStoredLaggingBehindCommitIndex, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+
+    /* Server 0 takes a long time to persist entry 2 (the barrier) */
+    CLUSTER_SET_DISK_LATENCY(0, 10000);
+
+    /* Server 0 gets elected and creates a barrier entry at index 2 */
+    BOOTSTRAP_START_AND_ELECT;
+
+    /* Server 0 commits and applies barrier entry 2 even though it has not
+     * persisted it yet. */
+    CLUSTER_STEP_UNTIL_APPLIED(0, 2, 2000);
+
+    munit_assert_int(CLUSTER_RAFT(0)->last_stored, ==, 1);
+    munit_assert_int(CLUSTER_RAFT(0)->commit_index, ==, 2);
+    munit_assert_int(CLUSTER_RAFT(0)->last_applied, ==, 2);
+
+    /* Server 1 stored barrier entry 2, but did not yet receive a notification
+     * from server 0 about the new commit index. */
+    munit_assert_int(CLUSTER_RAFT(1)->last_stored, ==, 2);
+    munit_assert_int(CLUSTER_RAFT(1)->commit_index, ==, 1);
+    munit_assert_int(CLUSTER_RAFT(1)->last_applied, ==, 1);
+
+    /* Disconnect server 0 from servers 1 and 2. */
+    CLUSTER_DISCONNECT(0, 1);
+    CLUSTER_DISCONNECT(0, 2);
+
+    /* Set a very high election timeout on server 0, so it won't step down for
+     * a while, even if disconnected. */
+    raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 10000);
+    raft_set_election_timeout(CLUSTER_RAFT(0), 10000);
+
+    /* Servers 1 and 2 eventually time out and start an election; server 1
+     * wins. */
+    CLUSTER_STEP_UNTIL_HAS_NO_LEADER(4000);
+    CLUSTER_STEP_UNTIL_HAS_LEADER(2000);
+    munit_assert_int(CLUSTER_LEADER, ==, 1);
+
+    /* Server 1 commits the barrier entry at index 3 that it created at the
+     * start of its term. */
+    CLUSTER_STEP_UNTIL_APPLIED(1, 3, 2000);
+
+    /* Reconnect server 0 to server 1, which will start replicating entry 3 to
+     * it. */
+    CLUSTER_RECONNECT(0, 1);
+    CLUSTER_STEP_UNTIL_APPLIED(0, 3, 20000);
+
+    return MUNIT_OK;
+}
+
+/* A leader with a faulty disk fails to persist the barrier entry upon
+ * election. */
+TEST(replication, failPersistBarrier, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CLUSTER_GROW;
+
+    /* Server 0 will fail to persist entry 2, a barrier */
+    raft_fixture_append_fault(&f->cluster, 0, 0);
+
+    /* Server 0 gets elected and creates a barrier entry at index 2 */
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+    CLUSTER_START_ELECT(0);
+
+    /* Cluster recovers.
*/ + CLUSTER_STEP_UNTIL_HAS_LEADER(20000); + + return MUNIT_OK; +} + +/* All servers fail to persist the barrier entry upon election of the first + * leader. Ensure the cluster is able to make progress afterwards. + */ +TEST(replication, failPersistBarrierFollower, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + + /* The servers will fail to persist entry 2, a barrier */ + raft_fixture_append_fault(&f->cluster, 1, 0); + raft_fixture_append_fault(&f->cluster, 2, 0); + + /* Server 0 gets elected and creates a barrier entry at index 2 */ + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_START_ELECT(0); + + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + return MUNIT_OK; +} + +/* A leader originates a log entry, fails to persist it, and steps down. + * A follower that received the entry wins the ensuing election and sends + * the same entry back to the original leader, while the original leader + * still has an outgoing pending message that references its copy of the + * entry. This triggers the original leader to reinstate the entry in its + * log. */ +TEST(replication, receiveSameWithPendingSend, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply req; + + /* Three voters. */ + CLUSTER_GROW; + /* Server 0 is the leader. */ + BOOTSTRAP_START_AND_ELECT; + + /* Server 1 never gets the entry. */ + raft_fixture_set_send_latency(&f->cluster, 0, 1, 10000); + + /* Disk write fails, but not before the entry gets to server 2. */ + CLUSTER_SET_DISK_LATENCY(0, 1000); + raft_fixture_append_fault(&f->cluster, 0, 0); + req.data = (void *)(intptr_t)RAFT_IOERR; + CLUSTER_APPLY_ADD_X(0, &req, 1, NULL); + /* Server 0 steps down. */ + CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 1500); + munit_assert_ullong(CLUSTER_RAFT(0)->current_term, ==, 2); + ASSERT_FOLLOWER(1); + ASSERT_FOLLOWER(2); + /* Only server 2 has the new entry. */ + munit_assert_ullong(CLUSTER_RAFT(0)->last_stored, ==, 2); + munit_assert_ullong(CLUSTER_RAFT(1)->last_stored, ==, 2); + munit_assert_ullong(CLUSTER_RAFT(2)->last_stored, ==, 3); + + /* Server 2 times out first and wins the election. */ + raft_set_election_timeout(CLUSTER_RAFT(2), 500); + raft_fixture_start_elect(&f->cluster, 2); + CLUSTER_STEP_UNTIL_STATE_IS(2, RAFT_LEADER, 1000); + munit_assert_ullong(CLUSTER_RAFT(2)->current_term, ==, 3); + + /* Server 0 gets the same entry back from server 2. 
*/ + CLUSTER_STEP_UNTIL_APPLIED(2, 3, 1000); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_snapshot.c b/test/raft/integration/test_snapshot.c new file mode 100644 index 000000000..e75d27ba5 --- /dev/null +++ b/test/raft/integration/test_snapshot.c @@ -0,0 +1,860 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(3); + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_ELECT(0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Set the snapshot threshold on all servers of the cluster */ +#define SET_SNAPSHOT_THRESHOLD(VALUE) \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + raft_set_snapshot_threshold(CLUSTER_RAFT(i), VALUE); \ + } \ + } + +/* Set the snapshot trailing logs number on all servers of the cluster */ +#define SET_SNAPSHOT_TRAILING(VALUE) \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + raft_set_snapshot_trailing(CLUSTER_RAFT(i), VALUE); \ + } \ + } + +/* Set the snapshot timeout on all servers of the cluster */ +#define SET_SNAPSHOT_TIMEOUT(VALUE) \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + raft_set_install_snapshot_timeout(CLUSTER_RAFT(i), VALUE); \ + } \ + } + +static int ioMethodSnapshotPutFail(struct raft_io *raft_io, + unsigned trailing, + struct raft_io_snapshot_put *req, + const struct raft_snapshot *snapshot, + raft_io_snapshot_put_cb cb) +{ + (void)raft_io; + (void)trailing; + (void)req; + (void)snapshot; + (void)cb; + return -1; +} + +#define SET_FAULTY_SNAPSHOT_PUT() \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + CLUSTER_RAFT(i)->io->snapshot_put = ioMethodSnapshotPutFail; \ + } \ + } + +static int ioMethodAsyncWorkFail(struct raft_io *raft_io, + struct raft_io_async_work *req, + raft_io_async_work_cb cb) +{ + (void)raft_io; + (void)req; + (void)cb; + return -1; +} + +#define SET_FAULTY_ASYNC_WORK() \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + CLUSTER_RAFT(i)->io->async_work = ioMethodAsyncWorkFail; \ + } \ + } + +static int fsmSnapshotFail(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + (void)fsm; + (void)bufs; + (void)n_bufs; + return -1; +} + +#define SET_FAULTY_SNAPSHOT_ASYNC() \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + CLUSTER_RAFT(i)->fsm->snapshot_async = fsmSnapshotFail; \ + } \ + } + +#define RESET_FSM_ASYNC(I) \ + { \ + struct raft_fsm *fsm = CLUSTER_RAFT(I)->fsm; \ + FsmClose(fsm); \ + FsmInitAsync(fsm, fsm->version); \ + } + +#define SET_FAULTY_SNAPSHOT() \ + { \ + unsigned i; \ + for (i = 0; i < CLUSTER_N; i++) { \ + CLUSTER_RAFT(i)->fsm->snapshot = fsmSnapshotFail; \ + } \ + } + +/****************************************************************************** + * + * Successfully install a snapshot + * + *****************************************************************************/ + +SUITE(snapshot) + +/* Install a snapshot on a follower that has fallen behind. 
*/ +TEST(snapshot, installOne, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Apply a few of entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect the follower and wait for it to catch up */ + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Check that the leader has sent a snapshot */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + return MUNIT_OK; +} + +/* Install snapshot times out and leader retries */ +TEST(snapshot, installOneTimeOut, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few of entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 in order for AppendEntries RPCs to not be + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 so that + * the InstallSnapshot RPC will time out */ + CLUSTER_SET_DISK_LATENCY(2, 300); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Wait a while and check that the leader has sent a snapshot */ + CLUSTER_STEP_UNTIL_ELAPSED(300); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + /* Wait for the snapshot to be installed */ + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Assert that the leader has retried the InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); + + return MUNIT_OK; +} + +/* Install snapshot to an offline node */ +TEST(snapshot, + installOneDisconnectedFromBeginningReconnects, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few of entries, to force a snapshot to be taken. 
Disconnect + * servers 0 and 2 so that the network calls return failure status */ + CLUSTER_DISCONNECT(0, 2); + CLUSTER_DISCONNECT(2, 0); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Wait a while so leader detects offline node */ + CLUSTER_STEP_UNTIL_ELAPSED(2000); + + /* Assert that the leader doesn't try sending a snapshot to an offline node + */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + + CLUSTER_RECONNECT(0, 2); + CLUSTER_RECONNECT(2, 0); + /* Wait for the snapshot to be installed */ + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Assert that the leader has sent an InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + return MUNIT_OK; +} + +/* Install snapshot to an offline node that went down during operation */ +TEST(snapshot, + installOneDisconnectedDuringOperationReconnects, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few of entries */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Wait for follower to catch up*/ + CLUSTER_STEP_UNTIL_APPLIED(2, 5, 5000); + /* Assert that the leader hasn't sent an InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + + CLUSTER_DISCONNECT(0, 2); + CLUSTER_DISCONNECT(2, 0); + + /* Wait a while so leader detects offline node */ + CLUSTER_STEP_UNTIL_ELAPSED(2000); + + /* Apply a few more entries */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Assert that the leader doesn't try sending snapshot to an offline node */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + + CLUSTER_RECONNECT(0, 2); + CLUSTER_RECONNECT(2, 0); + CLUSTER_STEP_UNTIL_APPLIED(2, 8, 5000); + + /* Assert that the leader has tried sending an InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + return MUNIT_OK; +} + +/* No snapshots sent to killed nodes */ +TEST(snapshot, noSnapshotInstallToKilled, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Kill a server */ + CLUSTER_KILL(2); + + /* Apply a few of entries */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Wait a while */ + CLUSTER_STEP_UNTIL_ELAPSED(4000); + + /* Assert that the leader hasn't sent an InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + return MUNIT_OK; +} + +/* Install snapshot times out and leader retries, afterwards AppendEntries + * resume */ +TEST(snapshot, installOneTimeOutAppendAfter, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few of entries, to force a snapshot to be taken. 
Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 so that + * the InstallSnapshot RPC will time out */ + CLUSTER_SET_DISK_LATENCY(2, 300); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Wait for the snapshot to be installed */ + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Append a few entries and check if they are replicated */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_STEP_UNTIL_APPLIED(2, 5, 5000); + + /* Assert that the leader has retried the InstallSnapshot RPC */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 2); + + return MUNIT_OK; +} + +/* Install 2 snapshots that both time out and assure the follower catches up */ +TEST(snapshot, installMultipleTimeOut, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 so that + * the InstallSnapshot RPC will time out */ + CLUSTER_SET_DISK_LATENCY(2, 300); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Step until the snapshot times out */ + CLUSTER_STEP_UNTIL_ELAPSED(400); + + /* Apply a few more entries, to force a new snapshot to be taken. Drop + * all traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect the follower */ + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + CLUSTER_STEP_UNTIL_APPLIED(2, 7, 5000); + + /* Assert that the leader has sent multiple InstallSnapshot RPCs */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), >=, 2); + + return MUNIT_OK; +}
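+ +/* The assertion above uses >= rather than == because the attempt count is + * timing dependent: each expired 200 ms snapshot timeout appears to trigger a + * resend, and the second snapshot can supersede the first, so the exact number + * of InstallSnapshot RPCs may vary from run to run. */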
+ +/* Install 2 snapshots that both time out, launch a few regular AppendEntries + * and assure the follower catches up */ +TEST(snapshot, installMultipleTimeOutAppendAfter, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + SET_SNAPSHOT_TIMEOUT(200); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 so that + * the InstallSnapshot RPC will time out */ + CLUSTER_SET_DISK_LATENCY(2, 300); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Step until the snapshot times out */ + CLUSTER_STEP_UNTIL_ELAPSED(400); + + /* Apply a few more entries, to force a new snapshot to be taken. Drop + * all traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect the follower */ + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + /* Append a few entries and make sure the follower catches up */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_STEP_UNTIL_APPLIED(2, 9, 5000); + + /* Assert that the leader has sent multiple InstallSnapshot RPCs */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), >=, 2); + + return MUNIT_OK; +} + +static bool server_installing_snapshot(struct raft_fixture *f, void *data) +{ + (void)f; + const struct raft *r = data; + return r->snapshot.put.data != NULL && r->last_stored == 0; +} + +static bool server_taking_snapshot(struct raft_fixture *f, void *data) +{ + (void)f; + const struct raft *r = data; + return r->snapshot.put.data != NULL && r->last_stored != 0; +} + +static bool server_snapshot_done(struct raft_fixture *f, void *data) +{ + (void)f; + const struct raft *r = data; + return r->snapshot.put.data == NULL; +} + +/* Follower receives HeartBeats during the installation of a snapshot */ +TEST(snapshot, installSnapshotHeartBeats, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + CLUSTER_SATURATE_BOTHWAYS(0, 1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Set a large disk latency on the follower; this will allow some + * heartbeats to be sent during the snapshot installation */ + CLUSTER_SET_DISK_LATENCY(1, 2000); + + munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 0); + + /* Step the cluster until server 1 installs a snapshot */ + const struct raft *r = CLUSTER_RAFT(1); + CLUSTER_DESATURATE_BOTHWAYS(0, 1); + CLUSTER_STEP_UNTIL(server_installing_snapshot, (void *)r, 2000); + munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + /* Count the number of AppendEntries RPCs received during the snapshot + * install */ + unsigned before = CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES); + CLUSTER_STEP_UNTIL(server_snapshot_done, (void *)r, 5000); + unsigned after = CLUSTER_N_RECV(1, RAFT_IO_APPEND_ENTRIES); + munit_assert_uint(before, <, after); + + /* Check that the InstallSnapshot RPC was not resent */ + munit_assert_uint(CLUSTER_N_RECV(1, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + /* Check that the snapshot was applied and we can still make progress */ + CLUSTER_STEP_UNTIL_APPLIED(1, 4, 5000); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_STEP_UNTIL_APPLIED(1, 6, 5000); + + return MUNIT_OK; +} + +/* InstallSnapshot RPC arrives while persisting Entries */ +TEST(snapshot, installSnapshotDuringEntriesWrite, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set a large disk latency on the follower; this will allow an + * InstallSnapshot RPC to arrive while the entries are being persisted. 
*/ + CLUSTER_SET_DISK_LATENCY(1, 2000); + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Replicate some entries; these will take a while to persist */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Make sure leader can't successfully send any more entries */ + CLUSTER_DISCONNECT(0, 1); + CLUSTER_MAKE_PROGRESS; /* Snapshot taken here */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; /* Snapshot taken here */ + CLUSTER_MAKE_PROGRESS; + + /* Snapshot with index 6 is sent while follower is still writing the entries + * to disk that arrived before the disconnect. */ + CLUSTER_RECONNECT(0, 1); + + /* Make sure follower is up to date */ + CLUSTER_STEP_UNTIL_APPLIED(1, 7, 5000); + return MUNIT_OK; +} + +static char *fsm_version[] = {"1", "2", "3", NULL}; +static char *fsm_snapshot_async[] = {"0", "1", NULL}; +static MunitParameterEnum fsm_snapshot_async_params[] = { + {CLUSTER_SS_ASYNC_PARAM, fsm_snapshot_async}, + {CLUSTER_FSM_VERSION_PARAM, fsm_version}, + {NULL, NULL}, +}; + +static char *fsm_snapshot_only_async[] = {"1", NULL}; +static char *fsm_version_only_async[] = {"3", NULL}; +static MunitParameterEnum fsm_snapshot_only_async_params[] = { + {CLUSTER_SS_ASYNC_PARAM, fsm_snapshot_only_async}, + {CLUSTER_FSM_VERSION_PARAM, fsm_version_only_async}, + {NULL, NULL}, +}; + +/* Follower receives AppendEntries RPCs while taking a snapshot */ +TEST(snapshot, + takeSnapshotAppendEntries, + setUp, + tearDown, + 0, + fsm_snapshot_async_params) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Set a large disk latency on the follower; this will allow AppendEntries + * to be sent while a snapshot is taken */ + CLUSTER_SET_DISK_LATENCY(1, 2000); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Step the cluster until server 1 takes a snapshot */ + const struct raft *r = CLUSTER_RAFT(1); + CLUSTER_STEP_UNTIL(server_taking_snapshot, (void *)r, 3000); + + /* Send AppendEntries RPCs while server 1 is taking a snapshot */ + static struct raft_apply reqs[5]; + for (int i = 0; i < 5; i++) { + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &reqs[i], 1, NULL); + } + CLUSTER_STEP_UNTIL(server_snapshot_done, (void *)r, 5000); + + /* Make sure the AppendEntries are applied and we can make progress */ + CLUSTER_STEP_UNTIL_APPLIED(1, 9, 5000); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_STEP_UNTIL_APPLIED(1, 11, 5000); + return MUNIT_OK; +} + +TEST(snapshot, + takeSnapshotSnapshotPutFail, + setUp, + tearDown, + 0, + fsm_snapshot_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_SNAPSHOT_PUT(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* No crashes or leaks have occurred */ + return MUNIT_OK; +}
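+ +/* The parameter matrices above run each test once per combination of FSM + * version ("1", "2", "3") and synchronous vs asynchronous snapshot taking; + * the *_only_async variants pin version 3 together with async snapshots, + * which suggests fsm->snapshot_async is only available from FSM version 3 + * onwards. */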
+ +TEST(snapshot, + takeSnapshotAsyncWorkFail, + setUp, + tearDown, + 0, + fsm_snapshot_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_ASYNC_WORK(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* No crashes or leaks have occurred */ + return MUNIT_OK; +} + +TEST(snapshot, + takeSnapshotAsyncFail, + setUp, + tearDown, + 0, + fsm_snapshot_only_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_SNAPSHOT_ASYNC(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* No crashes or leaks have occurred */ + return MUNIT_OK; +} + +TEST(snapshot, + takeSnapshotAsyncFailOnce, + setUp, + tearDown, + 0, + fsm_snapshot_only_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_SNAPSHOT_ASYNC(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + /* Wait for snapshot to fail. */ + CLUSTER_STEP_UNTIL_ELAPSED(200); + /* Snapshot will have failed here. */ + + /* Set the non-faulty fsm->snapshot_async function */ + RESET_FSM_ASYNC(CLUSTER_LEADER); + CLUSTER_MAKE_PROGRESS; + + /* Wait for snapshot to be finished */ + CLUSTER_STEP_UNTIL_ELAPSED(200); + + /* Reconnect the follower and wait for it to catch up */ + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + CLUSTER_STEP_UNTIL_APPLIED(2, 4, 5000); + + /* Check that the leader has sent a snapshot */ + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + return MUNIT_OK; +} + +TEST(snapshot, takeSnapshotFail, setUp, tearDown, 0, fsm_snapshot_async_params) +{ + struct fixture *f = data; + (void)params; + + SET_FAULTY_SNAPSHOT(); + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. */ + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* No crashes or leaks have occurred */ + return MUNIT_OK; +} + +/* A follower doesn't convert to candidate state while it's installing a + * snapshot. */ +TEST(snapshot, snapshotBlocksCandidate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers and set a high disk latency on server 2 */ + CLUSTER_SET_DISK_LATENCY(2, 5000); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Wait a while and check that the leader has sent a snapshot */ + CLUSTER_STEP_UNTIL_ELAPSED(500); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + + /* Disconnect the servers again so that heartbeats, etc. 
won't arrive */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER); + munit_assert_ptr(CLUSTER_RAFT(2)->snapshot.put.data, !=, NULL); + CLUSTER_STEP_UNTIL_ELAPSED(4000); + munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER); + return MUNIT_OK; +} + +/* An UNAVAILABLE node doesn't install snapshots. */ +TEST(snapshot, unavailableDiscardsSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers */ + CLUSTER_SET_DISK_LATENCY(2, 600); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + + /* Wait a while and check that the leader has sent a snapshot */ + CLUSTER_STEP_UNTIL_ELAPSED(500); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + raft_fixture_make_unavailable(&f->cluster, 2); + CLUSTER_STEP_UNTIL_ELAPSED(500); + munit_assert_uint64(raft_last_applied(CLUSTER_RAFT(2)), ==, 1); + return MUNIT_OK; +} + +/* A new term starts while a node is installing a snapshot. */ +TEST(snapshot, newTermWhileInstalling, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + /* Set very low threshold and trailing entries number */ + SET_SNAPSHOT_THRESHOLD(3); + SET_SNAPSHOT_TRAILING(1); + + /* Apply a few entries, to force a snapshot to be taken. Drop all network + * traffic between servers 0 and 2 so that AppendEntries RPCs are not + * replicated */ + CLUSTER_SATURATE_BOTHWAYS(0, 2); + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + CLUSTER_MAKE_PROGRESS; + + /* Reconnect both servers */ + CLUSTER_SET_DISK_LATENCY(2, 3000); + CLUSTER_DESATURATE_BOTHWAYS(0, 2); + /* Wait a while and check that the leader has sent a snapshot */ + CLUSTER_STEP_UNTIL_ELAPSED(500); + munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1); + /* Force a new term to start */ + CLUSTER_DEPOSE; + CLUSTER_ELECT(1); + CLUSTER_STEP_UNTIL_ELAPSED(1000); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_start.c b/test/raft/integration/test_start.c new file mode 100644 index 000000000..d49cf2c88 --- /dev/null +++ b/test/raft/integration/test_start.c @@ -0,0 +1,223 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture with a fake raft_io instance. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Bootstrap the I'th server. 
*/ +#define BOOTSTRAP(I) \ + do { \ + struct raft_configuration _configuration; \ + int _rv; \ + struct raft *_raft; \ + CLUSTER_CONFIGURATION(&_configuration); \ + _raft = CLUSTER_RAFT(I); \ + _rv = raft_bootstrap(_raft, &_configuration); \ + munit_assert_int(_rv, ==, 0); \ + raft_configuration_close(&_configuration); \ + } while (0) + +/****************************************************************************** + * + * Set up a cluster with a single server. + * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(1); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * raft_start + * + *****************************************************************************/ + +SUITE(raft_start) + +/* There are two servers. The first has a snapshot present and no other + * entries. */ +TEST(raft_start, oneSnapshotAndNoEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + CLUSTER_SET_SNAPSHOT(0 /* server index */, + 6 /* last index */, + 2 /* last term */, + 1 /* conf index */, + 5 /* x */, + 7 /* y */); + CLUSTER_SET_TERM(0, 2); + BOOTSTRAP(1); + CLUSTER_START; + CLUSTER_MAKE_PROGRESS; + return MUNIT_OK; +} + +/* There are two servers. The first has a snapshot along with some follow-up + * entries. */ +TEST(raft_start, oneSnapshotAndSomeFollowUpEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry entries[2]; + struct raft_fsm *fsm; + + CLUSTER_GROW; + BOOTSTRAP(1); + + entries[0].type = RAFT_COMMAND; + entries[0].term = 2; + FsmEncodeSetX(6, &entries[0].buf); + + entries[1].type = RAFT_COMMAND; + entries[1].term = 2; + FsmEncodeAddY(2, &entries[1].buf); + + CLUSTER_SET_SNAPSHOT(0 /* server index */, + 6 /* last index */, + 2 /* last term */, + 1 /* conf index */, + 5 /* x */, + 7 /* y */); + CLUSTER_ADD_ENTRY(0, &entries[0]); + CLUSTER_ADD_ENTRY(1, &entries[1]); + CLUSTER_SET_TERM(0, 2); + + CLUSTER_START; + CLUSTER_MAKE_PROGRESS; + + fsm = CLUSTER_FSM(0); + munit_assert_int(FsmGetX(fsm), ==, 7); + + return MUNIT_OK; +} + +/****************************************************************************** + * + * Start with entries present on disk. + * + *****************************************************************************/ + +/* There are 3 servers. The first has no entries present at all */ +TEST(raft_start, noEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + CLUSTER_GROW; + BOOTSTRAP(1); + BOOTSTRAP(2); + CLUSTER_START; + CLUSTER_MAKE_PROGRESS; + return MUNIT_OK; +}
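+ +/* Only the servers that start from a blank disk get bootstrapped above: + * raft_bootstrap refuses to run on a server that already has state on disk + * (it reports RAFT_CANTBOOTSTRAP in that case), which is why the server + * seeded with CLUSTER_SET_SNAPSHOT is left alone. */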
+ +/* There are 3 servers, the first has some entries, the others don't. */ +TEST(raft_start, twoEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_configuration configuration; + struct raft_entry entry; + struct raft_fsm *fsm; + unsigned i; + int rv; + + CLUSTER_GROW; + CLUSTER_GROW; + + CLUSTER_CONFIGURATION(&configuration); + rv = raft_bootstrap(CLUSTER_RAFT(0), &configuration); + munit_assert_int(rv, ==, 0); + raft_configuration_close(&configuration); + + entry.type = RAFT_COMMAND; + entry.term = 3; + FsmEncodeSetX(123, &entry.buf); + + CLUSTER_ADD_ENTRY(0, &entry); + CLUSTER_SET_TERM(0, 3); + + BOOTSTRAP(1); + BOOTSTRAP(2); + + CLUSTER_START; + CLUSTER_ELECT(0); + CLUSTER_MAKE_PROGRESS; + + CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_N, 3, 3000); + + for (i = 0; i < CLUSTER_N; i++) { + fsm = CLUSTER_FSM(i); + munit_assert_int(FsmGetX(fsm), ==, 124); + } + + return MUNIT_OK; +} + +/* There is a single voting server in the cluster, which immediately elects + * itself when starting. */ +TEST(raft_start, singleVotingSelfElect, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_BOOTSTRAP; + CLUSTER_START; + munit_assert_int(CLUSTER_STATE(0), ==, RAFT_LEADER); + CLUSTER_MAKE_PROGRESS; + return MUNIT_OK; +} + +/* There are two servers in the cluster, one is voting and the other is + * not. When started, the non-voting server does not elect itself. */ +TEST(raft_start, singleVotingNotUs, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_GROW; + CLUSTER_BOOTSTRAP_N_VOTING(1); + CLUSTER_START; + munit_assert_int(CLUSTER_STATE(1), ==, RAFT_FOLLOWER); + CLUSTER_MAKE_PROGRESS; + return MUNIT_OK; +} + +static void state_cb(struct raft *r, unsigned short old, unsigned short new) +{ + munit_assert_ushort(old, !=, new); + r->data = (void *)(uintptr_t)0xFEEDBEEF; +} + +/* There is a single voting server in the cluster; register a state_cb and + * assert that it's called because the node will progress to leader. 
*/ +TEST(raft_start, singleVotingWithStateCb, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_BOOTSTRAP; + struct raft *r = CLUSTER_RAFT(0); + r->data = (void *)(uintptr_t)0; + raft_register_state_cb(r, state_cb); + CLUSTER_START; + munit_assert_uint((uintptr_t)r->data, ==, 0xFEEDBEEF); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_strerror.c b/test/raft/integration/test_strerror.c new file mode 100644 index 000000000..ae45e1867 --- /dev/null +++ b/test/raft/integration/test_strerror.c @@ -0,0 +1,49 @@ +#include "../../../src/raft.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * raft_strerror + * + *****************************************************************************/ + +SUITE(raft_strerror) + +#define ERR_CODE_MAP(X) \ + X(RAFT_NOMEM) \ + X(RAFT_BADID) \ + X(RAFT_DUPLICATEID) \ + X(RAFT_DUPLICATEADDRESS) \ + X(RAFT_BADROLE) \ + X(RAFT_MALFORMED) \ + X(RAFT_NOTLEADER) \ + X(RAFT_LEADERSHIPLOST) \ + X(RAFT_SHUTDOWN) \ + X(RAFT_CANTBOOTSTRAP) \ + X(RAFT_CANTCHANGE) \ + X(RAFT_CORRUPT) \ + X(RAFT_CANCELED) \ + X(RAFT_NAMETOOLONG) \ + X(RAFT_TOOBIG) \ + X(RAFT_NOCONNECTION) \ + X(RAFT_BUSY) \ + X(RAFT_IOERR) + +#define TEST_CASE_STRERROR(CODE) \ + TEST(raft_strerror, CODE, NULL, NULL, 0, NULL) \ + { \ + (void)data; \ + (void)params; \ + munit_assert_not_null(raft_strerror(CODE)); \ + return MUNIT_OK; \ + } + +ERR_CODE_MAP(TEST_CASE_STRERROR) + +TEST(raft_strerror, default, NULL, NULL, 0, NULL) +{ + (void)data; + (void)params; + munit_assert_string_equal(raft_strerror(666), "unknown error"); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_tick.c b/test/raft/integration/test_tick.c new file mode 100644 index 000000000..807518b91 --- /dev/null +++ b/test/raft/integration/test_tick.c @@ -0,0 +1,261 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + const char *n_voting_param = munit_parameters_get(params, "n_voting"); + unsigned n = 3; + unsigned n_voting = n; + if (n_voting_param != NULL) { + n_voting = atoi(n_voting_param); + } + SETUP_CLUSTER(n); + CLUSTER_BOOTSTRAP_N_VOTING(n_voting); + CLUSTER_START; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert the current value of the timer of the I'th raft instance */ +#define ASSERT_ELECTION_TIMER(I, MSECS) \ + { \ + struct raft *raft_ = CLUSTER_RAFT(I); \ + munit_assert_int( \ + raft_->io->time(raft_->io) - raft_->election_timer_start, ==, \ + MSECS); \ + } + +/* Assert the current state of the I'th raft instance. */ +#define ASSERT_STATE(I, STATE) munit_assert_int(CLUSTER_STATE(I), ==, STATE); + +/****************************************************************************** + * + * Tick callback + * + *****************************************************************************/ + +SUITE(tick) + +/* Internal timers are updated according to the given time delta. 
*/ +TEST(tick, electionTimer, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + CLUSTER_STEP; + ASSERT_ELECTION_TIMER(0, 100); + + CLUSTER_STEP; + ASSERT_ELECTION_TIMER(1, 100); + + CLUSTER_STEP; + ASSERT_ELECTION_TIMER(2, 100); + + CLUSTER_STEP; + ASSERT_ELECTION_TIMER(0, 200); + + return MUNIT_OK; +} + +/* If the election timeout expires, the follower is a voting server, and it + * hasn't voted yet in this term, then become candidate and start a new + * election. */ +TEST(tick, candidate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + (void)params; + + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout); + + /* The term has been incremented. */ + munit_assert_int(raft->current_term, ==, 2); + + /* We have voted for ourselves. */ + munit_assert_int(raft->voted_for, ==, 1); + + /* We are candidate */ + ASSERT_STATE(0, RAFT_CANDIDATE); + + /* The votes array is initialized */ + munit_assert_ptr_not_null(raft->candidate_state.votes); + munit_assert_true(raft->candidate_state.votes[0]); + munit_assert_false(raft->candidate_state.votes[1]); + + return MUNIT_OK; +} + +/* If the election timeout has not elapsed, stay follower. */ +TEST(tick, electionTimerNotExpired, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + (void)params; + + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout - 100); + ASSERT_STATE(0, RAFT_FOLLOWER); + + return MUNIT_OK; +} + +static char *elapse_non_voter_n_voting[] = {"1", NULL}; + +static MunitParameterEnum elapse_non_voter_params[] = { + {"n_voting", elapse_non_voter_n_voting}, + {NULL, NULL}, +}; + +/* If the election timeout has elapsed, but we're not a voter, stay follower. */ +TEST(tick, not_voter, setUp, tearDown, 0, elapse_non_voter_params) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(1); + (void)params; + + /* Prevent the timer of the first server from expiring. */ + raft_fixture_set_randomized_election_timeout(&f->cluster, 0, 2000); + raft_set_election_timeout(CLUSTER_RAFT(0), 2000); + + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout + 100); + ASSERT_STATE(1, RAFT_FOLLOWER); + + return MUNIT_OK; +} + +/* If we're leader and the election timeout elapses without hearing from a + * majority of the cluster, step down. */ +TEST(tick, no_contact, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + (void)params; + + CLUSTER_ELECT(0); + CLUSTER_SATURATE_BOTHWAYS(0, 1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Wait for the leader to step down. */ + CLUSTER_STEP_UNTIL_STATE_IS(0, RAFT_FOLLOWER, 2000); + + return MUNIT_OK; +}
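+ +/* The fixture assigns each server a randomized election timeout, by the usual + * raft convention somewhere in [election_timeout, 2 * election_timeout), + * which is why the tests above step by + * raft->follower_state.randomized_election_timeout rather than by a fixed + * constant; raft_fixture_set_randomized_election_timeout (used in not_voter) + * pins that value when a test needs a server to stay quiet. */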
+ +/* If we're candidate and the election timeout has elapsed, start a new + * election. */ +TEST(tick, new_election, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + + (void)params; + + CLUSTER_SATURATE_BOTHWAYS(0, 1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Become candidate */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout); + + /* Expire the election timeout */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->candidate_state.randomized_election_timeout); + + /* The term has been incremented and saved to stable store. */ + munit_assert_int(raft->current_term, ==, 3); + + /* We have voted for ourselves. */ + munit_assert_int(raft->voted_for, ==, 1); + + /* We are still candidate */ + ASSERT_STATE(0, RAFT_CANDIDATE); + + /* The votes array is initialized */ + munit_assert_ptr_not_null(raft->candidate_state.votes); + munit_assert_true(raft->candidate_state.votes[0]); + munit_assert_false(raft->candidate_state.votes[1]); + + return MUNIT_OK; +} + +/* If the election timeout has not elapsed, stay candidate. */ +TEST(tick, during_election, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + (void)params; + + CLUSTER_SATURATE_BOTHWAYS(0, 1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Become candidate */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout); + + /* Make some time elapse, but not enough to trigger the timeout */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->candidate_state.randomized_election_timeout - 100); + + /* We are still candidate at the same term */ + ASSERT_STATE(0, RAFT_CANDIDATE); + munit_assert_int(raft->current_term, ==, 2); + + return MUNIT_OK; +} + +static char *elapse_request_vote_only_to_voters_n_voting[] = {"2", NULL}; + +static MunitParameterEnum elapse_request_vote_only_to_voters_params[] = { + {"n_voting", elapse_request_vote_only_to_voters_n_voting}, + {NULL, NULL}, +}; + +/* Vote requests are sent only to voting servers. */ +TEST(tick, + request_vote_only_to_voters, + setUp, + tearDown, + 0, + elapse_request_vote_only_to_voters_params) +{ + struct fixture *f = data; + struct raft *raft = CLUSTER_RAFT(0); + (void)params; + + CLUSTER_SATURATE_BOTHWAYS(0, 1); + CLUSTER_SATURATE_BOTHWAYS(0, 2); + + /* Become candidate */ + CLUSTER_STEP_UNTIL_ELAPSED( + raft->follower_state.randomized_election_timeout); + + /* We have sent vote requests only to the voting server */ + //__assert_request_vote(f, 2, 2, 1, 1); + + return MUNIT_OK; +} diff --git a/test/raft/integration/test_transfer.c b/test/raft/integration/test_transfer.c new file mode 100644 index 000000000..a51d70898 --- /dev/null +++ b/test/raft/integration/test_transfer.c @@ -0,0 +1,209 @@ +#include "../lib/cluster.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture with a test raft cluster. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_CLUSTER; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +static void transferCb(struct raft_transfer *req) +{ + bool *done = req->data; + munit_assert_false(*done); + *done = true; +} + +static bool transferCbHasFired(struct raft_fixture *f, void *arg) +{ + bool *done = arg; + (void)f; + return *done; +} + +/* Submit a transfer leadership request against the I'th server. */ +#define TRANSFER_SUBMIT(I, ID) \ + struct raft *_raft = CLUSTER_RAFT(I); \ + struct raft_transfer _req; \ + bool _done = false; \ + int _rv; \ + _req.data = &_done; \ + _rv = raft_transfer(_raft, &_req, ID, transferCb); \ + munit_assert_int(_rv, ==, 0); + +/* Wait until the transfer leadership request completes. */ +#define TRANSFER_WAIT CLUSTER_STEP_UNTIL(transferCbHasFired, &_done, 2000) + +/* Submit a transfer leadership request and wait for it to complete. 
*/ +#define TRANSFER(I, ID) \ + do { \ + TRANSFER_SUBMIT(I, ID); \ + TRANSFER_WAIT; \ + } while (0) + +/* Submit a transfer leadership request against the I'th server and assert that + * the given error is returned. */ +#define TRANSFER_ERROR(I, ID, RV, ERRMSG) \ + do { \ + struct raft_transfer __req; \ + int __rv; \ + __rv = raft_transfer(CLUSTER_RAFT(I), &__req, ID, NULL); \ + munit_assert_int(__rv, ==, RV); \ + munit_assert_string_equal(CLUSTER_ERRMSG(I), ERRMSG); \ + } while (0) + +/****************************************************************************** + * + * Set up a cluster with three servers. + * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_CLUSTER(3); + CLUSTER_BOOTSTRAP; + CLUSTER_START; + CLUSTER_ELECT(0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_CLUSTER; + free(f); +} + +/****************************************************************************** + * + * raft_transfer + * + *****************************************************************************/ + +SUITE(raft_transfer) + +/* The follower we ask to transfer leadership to is up-to-date. */ +TEST(raft_transfer, upToDate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TRANSFER(0, 2); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, ==, 1); + return MUNIT_OK; +} + +/* The follower we ask to transfer leadership to needs to catch up. */ +TEST(raft_transfer, catchUp, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply req; + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req, 1, NULL); + TRANSFER(0, 2); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, ==, 1); + return MUNIT_OK; +} + +/* The follower we ask to transfer leadership to is down and the leadership + * transfer does not succeed. */ +TEST(raft_transfer, expire, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_apply req; + CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, &req, 1, NULL); + CLUSTER_KILL(1); + TRANSFER(0, 2); + munit_assert_int(CLUSTER_LEADER, ==, 0); + return MUNIT_OK; +} + +/* The given ID doesn't match any server in the current configuration. */ +TEST(raft_transfer, unknownServer, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TRANSFER_ERROR(0, 4, RAFT_BADID, "server ID is not valid"); + return MUNIT_OK; +} + +/* Submitting a transfer request twice is an error. */ +TEST(raft_transfer, twice, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TRANSFER_SUBMIT(0, 2); + TRANSFER_ERROR(0, 3, RAFT_NOTLEADER, "server is not the leader"); + TRANSFER_WAIT; + return MUNIT_OK; +} + +/* If the given ID is zero, the target is selected automatically. */ +TEST(raft_transfer, autoSelect, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TRANSFER(0, 0); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, !=, 0); + return MUNIT_OK; +} + +/* If the given ID is zero, the target is selected automatically. Followers that + * are up-to-date are preferred. */ +TEST(raft_transfer, autoSelectUpToDate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CLUSTER_KILL(1); + CLUSTER_MAKE_PROGRESS; + TRANSFER(0, 0); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, ==, 2); + return MUNIT_OK; +}
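+ +/* A note on the twice test above: raft_transfer appears to reuse the + * RAFT_NOTLEADER error for the case where a transfer is already in flight, + * which would explain why the second submission fails with "server is not the + * leader" even though server 0 still is the leader at that point. */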
+ +/* It's not possible to transfer leadership after the server has been + * demoted. */ +TEST(raft_transfer, afterDemotion, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_change req; + struct raft *raft = CLUSTER_RAFT(0); + int rv; + CLUSTER_ADD(&req); + CLUSTER_STEP_UNTIL_APPLIED(0, 3, 1000); + CLUSTER_ASSIGN(&req, RAFT_VOTER); + CLUSTER_STEP_UNTIL_APPLIED(0, 4, 1000); + rv = raft_assign(raft, &req, raft->id, RAFT_SPARE, NULL); + munit_assert_int(rv, ==, 0); + CLUSTER_STEP_UNTIL_APPLIED(0, 5, 1000); + TRANSFER_ERROR(0, 2, RAFT_NOTLEADER, "server is not the leader"); + return MUNIT_OK; +} + +static char *cluster_pre_vote[] = {"0", "1", NULL}; +static char *cluster_heartbeat[] = {"1", "100", NULL}; + +static MunitParameterEnum _params[] = { + {CLUSTER_PRE_VOTE_PARAM, cluster_pre_vote}, + {CLUSTER_HEARTBEAT_PARAM, cluster_heartbeat}, + {NULL, NULL}, +}; + +/* It's possible to transfer leadership even when pre-vote is active */ +TEST(raft_transfer, preVote, setUp, tearDown, 0, _params) +{ + struct fixture *f = data; + TRANSFER(0, 2); + CLUSTER_STEP_UNTIL_HAS_LEADER(1000); + munit_assert_int(CLUSTER_LEADER, ==, 1); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_append.c b/test/raft/integration/test_uv_append.c new file mode 100644 index 000000000..11ff501ef --- /dev/null +++ b/test/raft/integration/test_uv_append.c @@ -0,0 +1,1005 @@ +#include "../../../src/raft/uv.h" +#include "../lib/aio.h" +#include "../lib/runner.h" +#include "../lib/uv.h" +#include "append_helpers.h" + +#include <unistd.h> /* reconstructed include; sleep() is used below */ + +/* Maximum number of blocks a segment can have */ +#define MAX_SEGMENT_BLOCKS 4 + +/* This block size should work fine for all file systems. */ +#define SEGMENT_BLOCK_SIZE 4096 + +/* Default segment size */ +#define SEGMENT_SIZE 4096 * MAX_SEGMENT_BLOCKS + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; + int count; /* To generate deterministic entry data */ +}; + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_UV; + raft_uv_set_block_size(&f->io, SEGMENT_BLOCK_SIZE); + raft_uv_set_segment_size(&f->io, SEGMENT_SIZE); + f->count = 0; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + if (f == NULL) { + return; + } + TEAR_DOWN_UV_DEPS; + free(f); +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + if (f == NULL) { + return; + } + TEAR_DOWN_UV; + tearDownDeps(f); +}
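+ +/* Note the split above: tests that shut the raft_io instance down themselves + * (the ASSERT_ENTRIES macro below starts with TEAR_DOWN_UV) register + * tearDownDeps as their tear-down hook, so the uv instance isn't closed + * twice; everything else uses the full tearDown. */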
+ +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Shut down the fixture's raft_io instance, then load all entries on disk using + * a new raft_io instance, and assert that there are N entries with a total data + * size of TOTAL_DATA_SIZE bytes. */ +#define ASSERT_ENTRIES(N, TOTAL_DATA_SIZE) \ TEAR_DOWN_UV; \ do { \ struct uv_loop_s _loop; \ struct raft_uv_transport _transport; \ struct raft_io _io; \ raft_term _term; \ raft_id _voted_for; \ struct raft_snapshot *_snapshot; \ raft_index _start_index; \ struct raft_entry *_entries; \ size_t _i; \ size_t _n; \ void *_batch = NULL; \ size_t _total_data_size = 0; \ int _rv; \ \ _rv = uv_loop_init(&_loop); \ munit_assert_int(_rv, ==, 0); \ _transport.version = 1; \ _rv = raft_uv_tcp_init(&_transport, &_loop); \ munit_assert_int(_rv, ==, 0); \ _rv = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ munit_assert_int(_rv, ==, 0); \ _rv = _io.init(&_io, 1, "1"); \ if (_rv != 0) { \ munit_errorf("io->init(): %s (%d)", _io.errmsg, _rv); \ } \ _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \ &_entries, &_n); \ if (_rv != 0) { \ munit_errorf("io->load(): %s (%d)", _io.errmsg, _rv); \ } \ _io.close(&_io, NULL); \ uv_run(&_loop, UV_RUN_NOWAIT); \ raft_uv_close(&_io); \ raft_uv_tcp_close(&_transport); \ uv_loop_close(&_loop); \ \ munit_assert_ptr_null(_snapshot); \ munit_assert_int(_n, ==, N); \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ uint64_t _value = *(uint64_t *)_entry->buf.base; \ munit_assert_int(_entry->term, ==, 1); \ munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ munit_assert_int(_value, ==, _i); \ munit_assert_ptr_not_null(_entry->batch); \ } \ for (_i = 0; _i < _n; _i++) { \ struct raft_entry *_entry = &_entries[_i]; \ if (_entry->batch != _batch) { \ _batch = _entry->batch; \ raft_free(_batch); \ } \ _total_data_size += _entry->buf.len; \ } \ raft_free(_entries); \ munit_assert_int(_total_data_size, ==, TOTAL_DATA_SIZE); \ } while (0); + +/****************************************************************************** + * + * raft_io->append() + * + *****************************************************************************/ + +SUITE(append) + +/* Append an entries array containing unaligned buffers. */ +TEST(append, unaligned, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT_CB_DATA(0, 1, 9, NULL, NULL, RAFT_INVALID); + munit_assert_string_equal(f->io.errmsg, + "entry buffers must be 8-byte aligned"); + APPEND_SUBMIT_CB_DATA(1, 3, 63, NULL, NULL, RAFT_INVALID); + munit_assert_string_equal(f->io.errmsg, + "entry buffers must be 8-byte aligned"); + return MUNIT_OK; +} + +/* Append the very first batch of entries. */ +TEST(append, first, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + ASSERT_ENTRIES(1, 64); + return MUNIT_OK; +} + +/* As soon as the backend starts writing the first open segment, a second one + * and a third one get prepared. */ +TEST(append, prepareSegments, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + while (!DirHasFile(f->dir, "open-3")) { + LOOP_RUN(1); + } + munit_assert_true(DirHasFile(f->dir, "open-1")); + munit_assert_true(DirHasFile(f->dir, "open-2")); + munit_assert_true(DirHasFile(f->dir, "open-3")); + return MUNIT_OK; +} + +/* Once the first segment fills up, it gets finalized, and an additional one + * gets prepared, to maintain the available segments pool size. 
*/ +TEST(append, finalizeSegment, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + APPEND(1, 64); + while (!DirHasFile(f->dir, "open-4")) { + LOOP_RUN(1); + } + munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000004")); + munit_assert_false(DirHasFile(f->dir, "open-1")); + munit_assert_true(DirHasFile(f->dir, "open-4")); + return MUNIT_OK; +} + +/* The very first batch of entries to append is bigger than the regular open + * segment size. */ +TEST(append, firstBig, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + ASSERT_ENTRIES(MAX_SEGMENT_BLOCKS, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + return MUNIT_OK; +} + +/* The second batch of entries to append is bigger than the regular open + * segment size. */ +TEST(append, secondBig, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + return MUNIT_OK; +} + +/* Schedule multiple appends, each one exceeding the segment size. */ +TEST(append, severalBig, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(1, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(2, 2, MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + APPEND_WAIT(0); + APPEND_WAIT(1); + APPEND_WAIT(2); + ASSERT_ENTRIES(6, 6 * MAX_SEGMENT_BLOCKS * SEGMENT_BLOCK_SIZE); + return MUNIT_OK; +} + +/* Write the very first entry and then another one, both fitting in the same + * block. */ +TEST(append, fitBlock, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + APPEND(1, 64); + ASSERT_ENTRIES(2, 128); + return MUNIT_OK; +} + +/* Write an entry that fills the first block exactly and then another one. */ +TEST(append, matchBlock, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + size_t size; + + size = SEGMENT_BLOCK_SIZE; + size -= sizeof(uint64_t) + /* Format */ + sizeof(uint64_t) + /* Checksums */ + 8 + 16; /* Header */ + + APPEND(1, size); + APPEND(1, 64); + + ASSERT_ENTRIES(2, size + 64); + + return MUNIT_OK; +} + +/* Write an entry that exceeds the first block, then another one that fits in + * the second block, then a third one that fills the rest of the second block + * plus the whole third block exactly, and finally a fourth entry that fits in + * the fourth block */ +TEST(append, exceedBlock, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + size_t written; + size_t size1; + size_t size2; + + size1 = SEGMENT_BLOCK_SIZE; + + APPEND(1, size1); + APPEND(1, 64); + + written = sizeof(uint64_t) + /* Format version */ + 2 * sizeof(uint32_t) + /* CRC sums of first batch */ + 8 + 16 + /* Header of first batch */ + size1 + /* Size of first batch */ + 2 * sizeof(uint32_t) + /* CRC of second batch */ + 8 + 16 + /* Header of second batch */ + 64; /* Size of second batch */ + + /* Write a third entry that fills the second block exactly */ + size2 = SEGMENT_BLOCK_SIZE - (written % SEGMENT_BLOCK_SIZE); + size2 -= (2 * sizeof(uint32_t) + 8 + 16); + size2 += SEGMENT_BLOCK_SIZE; + + APPEND(1, size2); + + /* Write a fourth entry */ + APPEND(1, 64); + + ASSERT_ENTRIES(4, size1 + 64 + size2 + 64); + + return MUNIT_OK; +}
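+ +/* The byte accounting in the last two tests follows the segment layout that + * their own annotations imply: each segment starts with one 8-byte format + * word, and each batch contributes two 4-byte CRC sums, an 8-byte entry count + * plus 16 bytes of header per entry, and then the entry data itself. For a + * single 1-entry batch in a fresh segment that's 8 + 8 + 8 + 16 = 40 bytes of + * overhead, which is why an entry of SEGMENT_BLOCK_SIZE - 40 = 4056 bytes + * fills the first 4096-byte block exactly. */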
+ +/* If an append request is submitted before the write operation of the previous + * append request is started, then a single write will be performed for both + * requests. */ +TEST(append, batch, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 1, 64); + APPEND_SUBMIT(1, 1, 64); + APPEND_WAIT(0); + APPEND_WAIT(1); + return MUNIT_OK; +} + +/* An append request submitted while a write operation is in progress gets + * executed only when the write completes. */ +TEST(append, wait, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 1, 64); + LOOP_RUN(1); + APPEND_SUBMIT(1, 1, 64); + APPEND_WAIT(0); + APPEND_WAIT(1); + return MUNIT_OK; +} + +/* Several batches with different sizes get appended at a fast pace, forcing + * the segment arena to grow. */ +TEST(append, resizeArena, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 2, 64); + APPEND_SUBMIT(1, 1, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(2, 2, 64); + APPEND_SUBMIT(3, 1, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(4, 1, SEGMENT_BLOCK_SIZE); + APPEND_WAIT(0); + APPEND_WAIT(1); + APPEND_WAIT(2); + APPEND_WAIT(3); + APPEND_WAIT(4); + ASSERT_ENTRIES(7, 64 * 4 + SEGMENT_BLOCK_SIZE * 3); + return MUNIT_OK; +} + +/* A few append requests get queued, then a truncate request comes in and other + * append requests right after, before truncation is fully completed. */ +TEST(append, truncate, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int rv; + + return MUNIT_SKIP; /* FIXME: flaky */ + + APPEND(2, 64); + + APPEND_SUBMIT(0, 2, 64); + + rv = f->io.truncate(&f->io, 2); + munit_assert_int(rv, ==, 0); + + APPEND_SUBMIT(1, 2, 64); + + APPEND_WAIT(0); + APPEND_WAIT(1); + + return MUNIT_OK; +} + +/* A few append requests get queued, then a truncate request comes in and other + * append requests right after, before truncation is fully completed. However + * the backend is closed before the truncation request can be processed. */ +TEST(append, truncateClosing, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + int rv; + APPEND(2, 64); + APPEND_SUBMIT(0, 2, 64); + rv = f->io.truncate(&f->io, 2); + munit_assert_int(rv, ==, 0); + APPEND_SUBMIT(1, 2, 64); + APPEND_EXPECT(1, RAFT_CANCELED); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* A few append requests get queued, however the backend is closed before + * preparing the second segment completes. */ +TEST(append, prepareClosing, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 2, 64); + LOOP_RUN(1); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The counters of the open segments get increased as they are closed. */ +TEST(append, counter, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t size = SEGMENT_BLOCK_SIZE; + int i; + for (i = 0; i < 10; i++) { + APPEND(1, size); + } + munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000003")); + munit_assert_true(DirHasFile(f->dir, "0000000000000004-0000000000000006")); + munit_assert_true(DirHasFile(f->dir, "open-4")); + return MUNIT_OK; +} + +/* If the I/O instance is closed, all pending append requests get canceled. */ +TEST(append, cancel, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, 1, 64); + APPEND_EXPECT(0, RAFT_CANCELED); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The creation of the current open segment fails because there's no space. 
*/ +TEST(append, noSpaceUponPrepareCurrent, setUp, tearDown, 0, DirTmpfsParams) +{ + struct fixture *f = data; + SKIP_IF_NO_FIXTURE; + raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE * 32768); + APPEND_FAILURE( + 1, 64, RAFT_NOSPACE, + "create segment open-1: not enough space to allocate 134217728 bytes"); + return MUNIT_OK; +} + +/* The creation of a spare open segment fails because there's no space. */ +TEST(append, noSpaceUponPrepareSpare, setUp, tearDown, 0, DirTmpfsParams) +{ + struct fixture *f = data; + SKIP_IF_NO_FIXTURE; +#if defined(__powerpc64__) + /* XXX: fails on ppc64el */ + return MUNIT_SKIP; +#endif + raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE * 2); + DirFill(f->dir, SEGMENT_BLOCK_SIZE * 3); + APPEND(1, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(0, 1, SEGMENT_BLOCK_SIZE); + APPEND_EXPECT(0, RAFT_NOSPACE); + APPEND_WAIT(0); + return MUNIT_OK; +} + +/* The write request fails because there's not enough space. */ +TEST(append, noSpaceUponWrite, setUp, tearDownDeps, 0, DirTmpfsParams) +{ + struct fixture *f = data; + SKIP_IF_NO_FIXTURE; +#if defined(__powerpc64__) + /* XXX: fails on ppc64el */ + TEAR_DOWN_UV; + return MUNIT_SKIP; +#endif + raft_uv_set_segment_size(&f->io, SEGMENT_BLOCK_SIZE); + DirFill(f->dir, SEGMENT_BLOCK_SIZE * 2); + APPEND(1, 64); + APPEND_FAILURE(1, (SEGMENT_BLOCK_SIZE + 128), RAFT_NOSPACE, + "short write: 4096 bytes instead of 8192"); + DirRemoveFile(f->dir, ".fill"); + LOOP_RUN(50); + APPEND(5, 64); + ASSERT_ENTRIES(6, 384); + return MUNIT_OK; +} + +/* A few requests fail because not enough disk space is available. Eventually + * the space is released and the request succeeds. */ +TEST(append, noSpaceResolved, setUp, tearDownDeps, 0, DirTmpfsParams) +{ + struct fixture *f = data; + SKIP_IF_NO_FIXTURE; +#if defined(__powerpc64__) + /* XXX: fails on ppc64el */ + TEAR_DOWN_UV; + return MUNIT_SKIP; +#endif + DirFill(f->dir, SEGMENT_BLOCK_SIZE); + APPEND_FAILURE( + 1, 64, RAFT_NOSPACE, + "create segment open-1: not enough space to allocate 16384 bytes"); + APPEND_FAILURE( + 1, 64, RAFT_NOSPACE, + "create segment open-2: not enough space to allocate 16384 bytes"); + DirRemoveFile(f->dir, ".fill"); + f->count = 0; /* Reset the data counter */ + APPEND(1, 64); + ASSERT_ENTRIES(1, 64); + return MUNIT_OK; +} + +/* An error occurs while performing a write. */ +TEST(append, writeError, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + aio_context_t ctx = 0; + + /* FIXME: doesn't fail anymore after + * https://github.com/CanonicalLtd/raft/pull/49 */ + return MUNIT_SKIP; + + APPEND_SUBMIT(0, 1, 64); + AioFill(&ctx, 0); + APPEND_WAIT(0); + AioDestroy(ctx); + return MUNIT_OK; +} + +static char *oomHeapFaultDelay[] = {"1", /* FIXME "2", */ NULL}; +static char *oomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum oomParams[] = { + {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory conditions. */ +TEST(append, oom, setUp, tearDown, 0, oomParams) +{ + struct fixture *f = data; + HEAP_FAULT_ENABLE; + APPEND_ERROR(1, 64, RAFT_NOMEM, ""); + return MUNIT_OK; +} + +/* The uv instance is closed while a write request is in progress. */ +TEST(append, closeDuringWrite, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + /* TODO: broken */ + return MUNIT_SKIP; + + APPEND_SUBMIT(0, 1, 64); + LOOP_RUN(1); + TEAR_DOWN_UV; + + return MUNIT_OK; +}
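+ +/* The sizes in the no-space error messages above are easy to verify: + * noSpaceUponPrepareCurrent configures a segment of SEGMENT_BLOCK_SIZE * + * 32768 = 4096 * 32768 = 134217728 bytes; noSpaceResolved leaves the + * fixture's SEGMENT_SIZE of 4 * 4096 = 16384 bytes in place; and the "4096 + * bytes instead of 8192" short write corresponds to one block persisted out + * of the two that the oversized batch needed. */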
+ +/* When the backend is closed, all unused open segments get removed. */ +TEST(append, removeSegmentUponClose, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 64); + while (!DirHasFile(f->dir, "open-2")) { + LOOP_RUN(1); + } + TEAR_DOWN_UV; + munit_assert_false(DirHasFile(f->dir, "open-2")); + return MUNIT_OK; +} + +/* When the backend is closed, all pending prepare requests get canceled. */ +TEST(append, cancelPrepareRequest, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + /* TODO: find a way to test a prepare request cancelation */ + return MUNIT_SKIP; + APPEND(MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(0, 1, 64); + APPEND_EXPECT(0, RAFT_CANCELED); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* When the backend gets closed it tells the writer to close the segment that + * it's currently writing. */ +TEST(append, currentSegment, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + + APPEND(1, 64); + + TEAR_DOWN_UV; + + munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000001")); + + return MUNIT_OK; +} + +/* The kernel has run out of available AIO events. */ +TEST(append, ioSetupError, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + aio_context_t ctx = 0; + int rv; + rv = AioFill(&ctx, 0); + if (rv != 0) { + return MUNIT_SKIP; + } + APPEND_FAILURE(1, 64, RAFT_TOOMANY, + "setup writer for open-1: AIO events user limit exceeded"); + return MUNIT_OK; +} + +/*=========================================================================== + Test interaction between UvAppend and UvBarrier + ===========================================================================*/ + +struct barrierData +{ + int current; /* Count the number of finished AppendEntries RPCs */ + int expected; /* Expected number of finished AppendEntries RPCs */ + bool done; /* @true if the Barrier CB has fired */ + bool expectDone; /* Expect the Barrier CB to have fired or not */ + char **files; /* Expected files in the directory, NULL terminated */ + struct uv *uv; +}; + +static void barrierCbCompareCounter(struct UvBarrierReq *barrier) +{ + struct barrierData *bd = barrier->data; + munit_assert_false(bd->done); + bd->done = true; + struct uv *uv = bd->uv; + UvUnblock(uv); + munit_assert_int(bd->current, ==, bd->expected); + if (bd->files != NULL) { + int i = 0; + while (bd->files[i] != NULL) { + munit_assert_true(DirHasFile(uv->dir, bd->files[i])); + ++i; + } + } +} + +static void barrierDoneCb(struct UvBarrierReq *barrier) +{ + struct barrierData *bd = barrier->data; + munit_assert_false(bd->done); + bd->done = true; +} + +static void appendCbIncreaseCounterAssertResult(struct raft_io_append *req, + int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; + struct barrierData *bd = result->data; + munit_assert_true(bd->done == bd->expectDone); + bd->current += 1; +} + +static void appendDummyCb(struct raft_io_append *req, int status) +{ + (void)req; + (void)status; +} + +static char *bools[] = {"0", "1", NULL}; +static MunitParameterEnum blocking_bool_params[] = { + {"bool", bools}, + {NULL, NULL}, +};
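+ +/* What the barrier tests below exercise, as far as these fixtures show: + * UvBarrier(uv, next_index, req) fires req->cb once the open segments below + * that index have been finalized, so a non-blocking barrier lets in-flight + * append writes finish first (expectDone is false in those tests), while a + * blocking barrier additionally holds back writes submitted after it until + * UvUnblock(uv) is called, which is why barrierCbCompareCounter unblocks + * explicitly. */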
+ +/* Fill up 3 segments' worth of AppendEntries RPCs. + * Request a Barrier and expect that the AppendEntries RPCs are finished before + * the Barrier callback is fired. + */ +TEST(append, barrierOpenSegments, setUp, tearDown, 0, blocking_bool_params) +{ + struct fixture *f = data; + struct barrierData bd = {0}; + bd.current = 0; + bd.expected = 3; + bd.done = false; + bd.expectDone = false; + bd.uv = f->io.impl; + char *files[] = {"0000000000000001-0000000000000004", + "0000000000000005-0000000000000008", + "0000000000000009-0000000000000012", NULL}; + bd.files = files; + + APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendCbIncreaseCounterAssertResult, &bd, 0); + APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendCbIncreaseCounterAssertResult, &bd, 0); + APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendCbIncreaseCounterAssertResult, &bd, 0); + + struct UvBarrierReq barrier = {0}; + barrier.data = (void *)&bd; + barrier.blocking = + (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); + barrier.cb = barrierCbCompareCounter; + UvBarrier(f->io.impl, 1, &barrier); + + /* Make sure every callback fired */ + LOOP_RUN_UNTIL(&bd.done); + APPEND_WAIT(0); + APPEND_WAIT(1); + APPEND_WAIT(2); + return MUNIT_OK; +} + +/* Fill up 3 segments' worth of AppendEntries RPCs. + * Request a Barrier and stop early. + */ +TEST(append, barrierOpenSegmentsExitEarly, setUp, NULL, 0, blocking_bool_params) +{ + struct fixture *f = data; + struct barrierData bd = {0}; + bd.current = 0; + bd.expected = 3; + bd.done = false; + bd.expectDone = false; + bd.uv = f->io.impl; + char *files[] = {"0000000000000001-0000000000000004", + "0000000000000005-0000000000000008", + "0000000000000009-0000000000000012", NULL}; + bd.files = files; + + APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendDummyCb, NULL, 0); + APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendDummyCb, NULL, 0); + APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE, + appendDummyCb, NULL, 0); + + struct UvBarrierReq barrier = {0}; + barrier.data = (void *)&bd; + barrier.blocking = + (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0); + barrier.cb = barrierDoneCb; + UvBarrier(f->io.impl, 1, &barrier); + + /* Exit early. */ + tearDown(data); + munit_assert_true(bd.done); + + return MUNIT_OK; +} + +/* Fill up 3 segments' worth of AppendEntries RPCs. + * Request 2 barriers and expect their callbacks to fire. + */ +TEST(append, twoBarriersOpenSegments, setUp, tearDown, 0, blocking_bool_params) +{ + struct fixture *f = data; + struct barrierData bd1 = {0}; + bd1.current = 0; + bd1.expected = 3; + bd1.done = false; + bd1.expectDone = false; + bd1.uv = f->io.impl; + char *files[] = {"0000000000000001-0000000000000004", + "0000000000000005-0000000000000008", + "0000000000000009-0000000000000012", NULL}; + bd1.files = files; + /* Only expect the callback to eventually fire. 
+    struct barrierData bd2 = {0};
+    bd2.uv = f->io.impl;
+
+    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd1, 0);
+    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd1, 0);
+    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd1, 0);
+
+    struct UvBarrierReq barrier1 = {0};
+    barrier1.data = (void *)&bd1;
+    barrier1.blocking =
+        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
+    barrier1.cb = barrierCbCompareCounter;
+    UvBarrier(f->io.impl, 1, &barrier1);
+    struct UvBarrierReq barrier2 = {0};
+    barrier2.data = (void *)&bd2;
+    barrier2.blocking =
+        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
+    barrier2.cb = barrierCbCompareCounter;
+    UvBarrier(f->io.impl, 1, &barrier2);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd1.done);
+    LOOP_RUN_UNTIL(&bd2.done);
+    APPEND_WAIT(0);
+    APPEND_WAIT(1);
+    APPEND_WAIT(2);
+    return MUNIT_OK;
+}
+
+/* Fill up 3 segments worth of AppendEntries RPCs.
+ * Request 2 barriers and exit early. */
+TEST(append, twoBarriersExitEarly, setUp, NULL, 0, blocking_bool_params)
+{
+    struct fixture *f = data;
+    struct barrierData bd1 = {0};
+    bd1.current = 0;
+    bd1.expected = 3;
+    bd1.done = false;
+    bd1.expectDone = false;
+    bd1.uv = f->io.impl;
+    char *files[] = {"0000000000000001-0000000000000004",
+                     "0000000000000005-0000000000000008",
+                     "0000000000000009-0000000000000012", NULL};
+    bd1.files = files;
+    /* Only expect the callback to eventually fire. */
+    struct barrierData bd2 = {0};
+    bd2.uv = f->io.impl;
+
+    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendDummyCb, NULL, 0);
+    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendDummyCb, NULL, 0);
+    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendDummyCb, NULL, 0);
+
+    struct UvBarrierReq barrier1 = {0};
+    barrier1.data = (void *)&bd1;
+    barrier1.blocking =
+        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
+    barrier1.cb = barrierDoneCb;
+    UvBarrier(f->io.impl, 1, &barrier1);
+    struct UvBarrierReq barrier2 = {0};
+    barrier2.data = (void *)&bd2;
+    barrier2.blocking =
+        (bool)strtoul(munit_parameters_get(params, "bool"), NULL, 0);
+    barrier2.cb = barrierDoneCb;
+    UvBarrier(f->io.impl, 1, &barrier2);
+
+    /* Exit early. */
+    tearDown(data);
+    munit_assert_true(bd1.done);
+    munit_assert_true(bd2.done);
+
+    return MUNIT_OK;
+}
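+
+/* The tests above request barriers after submitting appends; the tests below
+ * request the barrier first, showing that a blocking barrier holds back
+ * appends submitted after it until UvUnblock() is called. */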
+
+/* Request a blocking Barrier and expect that no AppendEntries RPCs are
+ * finished before the Barrier callback is fired. */
+TEST(append, blockingBarrierNoOpenSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct barrierData bd = {0};
+    bd.current = 0;
+    bd.expected = 0;
+    bd.done = false;
+    bd.expectDone = true;
+    bd.uv = f->io.impl;
+
+    struct UvBarrierReq barrier = {0};
+    barrier.data = (void *)&bd;
+    barrier.blocking = true;
+    barrier.cb = barrierCbCompareCounter;
+    UvBarrier(f->io.impl, 1, &barrier);
+
+    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd.done);
+    APPEND_WAIT(0);
+    APPEND_WAIT(1);
+    APPEND_WAIT(2);
+    return MUNIT_OK;
+}
+
+/* Request a blocking Barrier and expect that no AppendEntries RPCs are
+ * finished before the Barrier callback is fired. */
+TEST(append, blockingBarrierSingleOpenSegment, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct barrierData bd = {0};
+    bd.current = 0;
+    bd.expected = 0;
+    bd.done = false;
+    bd.expectDone = true;
+    bd.uv = f->io.impl;
+    char *files[] = {"0000000000000001-0000000000000001", NULL};
+    bd.files = files;
+
+    /* Wait until there is at least 1 open segment, otherwise the barrier Cb
+     * is fired immediately. */
+    APPEND(1, 64);
+    while (!DirHasFile(f->dir, "open-1")) {
+        LOOP_RUN(1);
+    }
+
+    struct UvBarrierReq barrier = {0};
+    barrier.data = (void *)&bd;
+    barrier.blocking = true;
+    barrier.cb = barrierCbCompareCounter;
+    UvBarrier(f->io.impl, 1, &barrier);
+
+    APPEND_SUBMIT_CB_DATA(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+    APPEND_SUBMIT_CB_DATA(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+    APPEND_SUBMIT_CB_DATA(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE,
+                          appendCbIncreaseCounterAssertResult, &bd, 0);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd.done);
+    APPEND_WAIT(0);
+    APPEND_WAIT(1);
+    APPEND_WAIT(2);
+    return MUNIT_OK;
+}
+
+static void longWorkCb(uv_work_t *work)
+{
+    (void)work;
+    sleep(1);
+}
+
+static void longAfterWorkCb(uv_work_t *work, int status)
+{
+    struct barrierData *bd = work->data;
+    munit_assert_false(bd->done);
+    bd->done = true;
+    munit_assert_int(status, ==, 0);
+    struct uv *uv = bd->uv;
+    UvUnblock(uv);
+    munit_assert_int(bd->current, ==, bd->expected);
+    free(work);
+}
+
+static void barrierCbLongWork(struct UvBarrierReq *barrier)
+{
+    struct barrierData *bd = barrier->data;
+    munit_assert_false(bd->done);
+    struct uv *uv = bd->uv;
+    int rv;
+
+    uv_work_t *work = munit_malloc(sizeof(*work));
+    munit_assert_ptr_not_null(work);
+    work->data = bd;
+
+    rv = uv_queue_work(uv->loop, work, longWorkCb, longAfterWorkCb);
+    munit_assert_int(rv, ==, 0);
+}
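+
+/* barrierCbLongWork hands the sleep off to the libuv threadpool via
+ * uv_queue_work(), so the event loop keeps running while the simulated
+ * snapshot work is in flight; longAfterWorkCb then runs back on the loop
+ * thread and is what actually unblocks the barrier. */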
+
+/* Request a non-blocking Barrier that triggers a long-running task; the
+ * barrier is removed when the long-running task completes. This simulates a
+ * large snapshot write. Ensure Append requests complete before the
+ * long-running task completes. */
+TEST(append, nonBlockingBarrierLongBlockingTask, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct barrierData bd = {0};
+    bd.current = 0;
+    bd.expected = 1;
+    bd.done = false;
+    bd.expectDone = false;
+    bd.uv = f->io.impl;
+
+    struct UvBarrierReq barrier = {0};
+    barrier.data = (void *)&bd;
+    barrier.blocking = false;
+    barrier.cb = barrierCbLongWork;
+    UvBarrier(f->io.impl, bd.uv->append_next_index, &barrier);
+    APPEND_SUBMIT_CB_DATA(0, 1, 64, appendCbIncreaseCounterAssertResult, &bd,
+                          0);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd.done);
+    APPEND_WAIT(0);
+    return MUNIT_OK;
+}
+
+/* Request a blocking Barrier that triggers a long-running task; the barrier
+ * is unblocked and removed when the long-running task completes. This
+ * simulates a large snapshot install. Ensure Append requests complete after
+ * the work completes. */
+TEST(append, blockingBarrierLongBlockingTask, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct barrierData bd = {0};
+    bd.current = 0;
+    bd.expected = 0;
+    bd.done = false;
+    bd.expectDone = true;
+    bd.uv = f->io.impl;
+
+    struct UvBarrierReq barrier = {0};
+    barrier.data = (void *)&bd;
+    barrier.blocking = true;
+    barrier.cb = barrierCbLongWork;
+    UvBarrier(f->io.impl, bd.uv->append_next_index, &barrier);
+    APPEND_SUBMIT_CB_DATA(0, 1, 64, appendCbIncreaseCounterAssertResult, &bd,
+                          0);
+
+    /* Make sure every callback fired */
+    LOOP_RUN_UNTIL(&bd.done);
+    APPEND_WAIT(0);
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_bootstrap.c b/test/raft/integration/test_uv_bootstrap.c
new file mode 100644
index 000000000..e987f15cb
--- /dev/null
+++ b/test/raft/integration/test_uv_bootstrap.c
@@ -0,0 +1,98 @@
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+/******************************************************************************
+ *
+ * Fixture with a libuv-based raft_io instance and an empty configuration.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+    struct raft_configuration conf;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+/* Add a server to the fixture's configuration. */
+#define CONFIGURATION_ADD(ID, ADDRESS) \
+    { \
+        int rv_; \
+        rv_ = raft_configuration_add(&f->conf, ID, ADDRESS, RAFT_VOTER); \
+        munit_assert_int(rv_, ==, 0); \
+    }
+
+/* Invoke f->io.bootstrap() and assert that no error occurs. */
+#define BOOTSTRAP \
+    { \
+        int rv_; \
+        rv_ = f->io.bootstrap(&f->io, &f->conf); \
+        munit_assert_int(rv_, ==, 0); \
+    }
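+
+/* A successful bootstrap persists the initial configuration and writes
+ * metadata with term 1, so bootstrapping the same data directory twice must
+ * fail (see the termIsNonZero test below). */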
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    SETUP_UV;
+    raft_configuration_init(&f->conf);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    raft_configuration_close(&f->conf);
+    TEAR_DOWN_UV;
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * raft_io->bootstrap()
+ *
+ *****************************************************************************/
+
+SUITE(bootstrap)
+
+/* Invoke f->io.bootstrap() and assert that it returns the given error code
+ * and message. */
+#define BOOTSTRAP_ERROR(RV, ERRMSG) \
+    { \
+        int rv_; \
+        rv_ = f->io.bootstrap(&f->io, &f->conf); \
+        munit_assert_int(rv_, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+    }
+
+/* Bootstrap a pristine server. */
+TEST(bootstrap, pristine, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CONFIGURATION_ADD(1, "1");
+    BOOTSTRAP;
+    return MUNIT_OK;
+}
+
+/* The data directory already has metadata files with a non-zero term. */
+TEST(bootstrap, termIsNonZero, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    CONFIGURATION_ADD(1, "1");
+    BOOTSTRAP;
+    BOOTSTRAP_ERROR(RAFT_CANTBOOTSTRAP, "metadata contains term 1");
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_init.c b/test/raft/integration/test_uv_init.c
new file mode 100644
index 000000000..d4358c689
--- /dev/null
+++ b/test/raft/integration/test_uv_init.c
@@ -0,0 +1,268 @@
+#include "../../../src/raft.h"
+#include "../../../src/raft/byte.h"
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+#include <linux/magic.h> /* TMPFS_MAGIC */
+#include <sys/vfs.h>     /* statfs() */
+
+/******************************************************************************
+ *
+ * Fixture with a non-initialized raft_io instance and uv dependencies.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+    bool closed;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+static void closeCb(struct raft_io *io)
+{
+    struct fixture *f = io->data;
+    f->closed = true;
+}
+
+/* Invoke raft_uv_init() and assert that no error occurs. */
+#define INIT(DIR) \
+    do { \
+        int _rv; \
+        _rv = raft_uv_init(&f->io, &f->loop, DIR, &f->transport); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = f->io.init(&f->io, 1, "1"); \
+        munit_assert_int(_rv, ==, 0); \
+    } while (0)
+
+/* Invoke raft_io->close(). */
+#define CLOSE \
+    do { \
+        f->io.close(&f->io, closeCb); \
+        LOOP_RUN_UNTIL(&f->closed); \
+        raft_uv_close(&f->io); \
+    } while (0)
+
+/* Invoke raft_uv_init() and assert that the given error code is returned and
+ * the given error message set. */
+#define INIT_ERROR(DIR, RV, ERRMSG) \
+    do { \
+        int _rv; \
+        _rv = raft_uv_init(&f->io, &f->loop, DIR, &f->transport); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = f->io.init(&f->io, 1, "1"); \
+        munit_assert_int(_rv, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+        CLOSE; \
+    } while (0)
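+
+/* On-disk metadata layout written by the helper below: four 8-byte
+ * little-endian words, in order the format version, the metadata version
+ * (used to pick the newer of metadata1 and metadata2), the term, and the
+ * voted-for server ID (0 if none). */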
+
+/* Write either the metadata1 or metadata2 file, filling it with the given
+ * values. */
+#define WRITE_METADATA_FILE(N, FORMAT, VERSION, TERM, VOTED_FOR) \
+    { \
+        uint8_t buf[8 * 4]; \
+        void *cursor = buf; \
+        char filename[strlen("metadataN") + 1]; \
+        sprintf(filename, "metadata%d", N); \
+        bytePut64(&cursor, FORMAT); \
+        bytePut64(&cursor, VERSION); \
+        bytePut64(&cursor, TERM); \
+        bytePut64(&cursor, VOTED_FOR); \
+        DirWriteFile(f->dir, filename, buf, sizeof buf); \
+    }
+
+#define LONG_DIR \
+    "/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" \
+    "/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" \
+    "/ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" \
+    "/ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" \
+    "/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee" \
+    "/fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" \
+    "/ggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg" \
+    "/hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh" \
+    "/iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii" \
+    "/jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj" \
+    "/kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk" \
+    "/lllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllll" \
+    "/mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm"
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    f->io.data = f;
+    f->closed = false;
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    if (f == NULL) {
+        return;
+    }
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * raft_io->init()
+ *
+ *****************************************************************************/
+
+SUITE(init)
+
+TEST(init, dirTooLong, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_io io = {0};
+    int rv;
+    rv = raft_uv_init(&io, &f->loop, LONG_DIR, &f->transport);
+    munit_assert_int(rv, ==, RAFT_NAMETOOLONG);
+    munit_assert_string_equal(io.errmsg, "directory path too long");
+    return 0;
+}
+
+/* Out of memory conditions upon probing for direct I/O. */
+TEST(init, probeDirectIoOom, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    /* XXX: tmpfs seems to not support O_DIRECT */
+    struct statfs info;
+    int rv;
+    rv = statfs(f->dir, &info);
+    munit_assert_int(rv, ==, 0);
+    if (info.f_type == TMPFS_MAGIC) {
+        return MUNIT_SKIP;
+    }
+#if defined(__powerpc64__)
+    /* XXX: fails on ppc64el */
+    return MUNIT_SKIP;
+#endif
+    HeapFaultConfig(&f->heap, 1 /* delay */, 1 /* repeat */);
+    HEAP_FAULT_ENABLE;
+    INIT_ERROR(f->dir, RAFT_NOMEM, "probe Direct I/O: out of memory");
+    return 0;
+}
+
+/* Out of memory conditions upon probing for async I/O. */
+TEST(init, probeAsyncIoOom, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    /* XXX: tmpfs seems to not support O_DIRECT */
+    struct statfs info;
+    int rv;
+    rv = statfs(f->dir, &info);
+    munit_assert_int(rv, ==, 0);
+    if (info.f_type == TMPFS_MAGIC) {
+        return MUNIT_SKIP;
+    }
+#if defined(__powerpc64__)
+    /* XXX: fails on ppc64el */
+    return MUNIT_SKIP;
+#endif
+    HeapFaultConfig(&f->heap, 2 /* delay */, 1 /* repeat */);
+    HEAP_FAULT_ENABLE;
+    INIT_ERROR(f->dir, RAFT_NOMEM, "probe Async I/O: out of memory");
+    return 0;
+}
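+
+/* In the two OOM tests above, the heap fault injector's "delay" argument
+ * selects which allocation fails: delay 1 makes the first allocation
+ * performed during init fail (the Direct I/O probe), delay 2 the second one
+ * (the Async I/O probe). */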
+
+/* The given directory does not exist. */
+TEST(init, dirDoesNotExist, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    INIT_ERROR("/foo/bar/egg/baz", RAFT_NOTFOUND,
+               "directory '/foo/bar/egg/baz' does not exist");
+    return MUNIT_OK;
+}
+
+/* The given directory is not accessible. */
+TEST(init, dirNotAccessible, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    char errmsg[RAFT_ERRMSG_BUF_SIZE];
+    sprintf(errmsg, "directory '%s' is not writable", f->dir);
+    DirMakeUnexecutable(f->dir);
+    INIT_ERROR(f->dir, RAFT_INVALID, errmsg);
+    return MUNIT_OK;
+}
+
+/* No space is left for probing I/O capabilities. */
+TEST(init, noSpace, setUp, tearDown, 0, DirTmpfsParams)
+{
+    struct fixture *f = data;
+    SKIP_IF_NO_FIXTURE;
+    DirFill(f->dir, 4);
+    INIT_ERROR(f->dir, RAFT_NOSPACE,
+               "create I/O capabilities probe file: not enough space to "
+               "allocate 4096 bytes");
+    return MUNIT_OK;
+}
+
+/* The metadata1 file does not have the expected number of bytes. In this case
+ * the file is not considered at all, and the effect is as if this was a brand
+ * new server. */
+TEST(init, metadataOneTooShort, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uint8_t buf[16] = {0};
+    DirWriteFile(f->dir, "metadata1", buf, sizeof buf);
+    INIT(f->dir);
+    CLOSE;
+    return MUNIT_OK;
+}
+
+/* The metadata1 file does not have the expected format. */
+TEST(init, metadataOneBadFormat, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    WRITE_METADATA_FILE(1, /* Metadata file index */
+                        2, /* Format */
+                        1, /* Version */
+                        1, /* Term */
+                        0 /* Voted for */);
+    INIT_ERROR(f->dir, RAFT_MALFORMED,
+               "decode content of metadata1: bad format version 2");
+    return MUNIT_OK;
+}
+
+/* The metadata1 file does not have a valid version. */
+TEST(init, metadataOneBadVersion, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    WRITE_METADATA_FILE(1, /* Metadata file index */
+                        1, /* Format */
+                        0, /* Version */
+                        1, /* Term */
+                        0 /* Voted for */);
+    INIT_ERROR(f->dir, RAFT_CORRUPT,
+               "decode content of metadata1: version is set to zero");
+    return MUNIT_OK;
+}
+
+/* The data directory has both metadata files, but they have the same
+ * version. */
+TEST(init, metadataOneAndTwoSameVersion, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    WRITE_METADATA_FILE(1, /* Metadata file index */
+                        1, /* Format */
+                        2, /* Version */
+                        3, /* Term */
+                        0 /* Voted for */);
+    WRITE_METADATA_FILE(2, /* Metadata file index */
+                        1, /* Format */
+                        2, /* Version */
+                        2, /* Term */
+                        0 /* Voted for */);
+    INIT_ERROR(f->dir, RAFT_CORRUPT,
+               "metadata1 and metadata2 are both at version 2");
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_load.c b/test/raft/integration/test_uv_load.c
new file mode 100644
index 000000000..5ef1e339a
--- /dev/null
+++ b/test/raft/integration/test_uv_load.c
@@ -0,0 +1,1772 @@
+#include <stdio.h> /* sprintf() */
+
+#include "../../../src/raft/byte.h"
+#include "../../../src/raft/uv.h"
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+/******************************************************************************
+ *
+ * Fixture with a non-initialized libuv-based raft_io instance.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+static void closeCb(struct raft_io *io)
+{
+    bool *done = io->data;
+    *done = true;
+}
+
+static void appendCb(struct raft_io_append *req, int status)
+{
+    bool *done = req->data;
+    munit_assert_int(status, ==, 0);
+    *done = true;
+}
+
+static void snapshotPutCb(struct raft_io_snapshot_put *req, int status)
+{
+    bool *done = req->data;
+    munit_assert_int(status, ==, 0);
+    *done = true;
+}
+
+struct snapshot
+{
+    raft_term term;
+    raft_index index;
+    uint64_t data;
+};
+
+#define WORD_SIZE 8
+
+/* Maximum number of blocks a segment can have */
+#define MAX_SEGMENT_BLOCKS 4
+
+/* This block size should work fine for all file systems. */
+#define SEGMENT_BLOCK_SIZE 4096
+
+/* Desired segment size */
+#define SEGMENT_SIZE (SEGMENT_BLOCK_SIZE * MAX_SEGMENT_BLOCKS)
+
+#define CLOSED_SEGMENT_FILENAME(START, END) \
+    "000000000000000" #START \
+    "-" \
+    "000000000000000" #END
+
+/* Check if open segment file exists. */
+#define HAS_OPEN_SEGMENT_FILE(COUNT) DirHasFile(f->dir, "open-" #COUNT)
+
+/* Check if closed segment file exists. */
+#define HAS_CLOSED_SEGMENT_FILE(START, END) \
+    DirHasFile(f->dir, CLOSED_SEGMENT_FILENAME(START, END))
+
+/* Initialize a standalone raft_io instance and use it to append N batches of
+ * entries, each containing one entry. DATA should be an integer that will be
+ * used as base value for the data of the first entry, and will then be
+ * incremented for subsequent entries. */
+#define APPEND(N, DATA) \
+    do { \
+        struct raft_uv_transport _transport; \
+        struct raft_io _io; \
+        raft_term _term; \
+        raft_id _voted_for; \
+        struct raft_snapshot *_snapshot; \
+        raft_index _start_index; \
+        struct raft_entry *_entries; \
+        size_t _i; \
+        size_t _n; \
+        void *_batch = NULL; \
+        struct raft_entry _new_entry; \
+        uint64_t _new_entry_data; \
+        uint64_t _data = DATA; \
+        struct raft_io_append _req; \
+        bool _done = false; \
+        int _rv; \
+ \
+        /* Initialize the instance, loading existing data, but discarding \
+         * it. This makes sure that the start index is correctly set. */ \
+        _transport.version = 1; \
+        _rv = raft_uv_tcp_init(&_transport, &f->loop); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = raft_uv_init(&_io, &f->loop, f->dir, &_transport); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = _io.init(&_io, 1, "1"); \
+        munit_assert_int(_rv, ==, 0); \
+        raft_uv_set_block_size(&_io, SEGMENT_BLOCK_SIZE); \
+        raft_uv_set_segment_size(&_io, SEGMENT_SIZE); \
+        _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \
+                       &_entries, &_n); \
+        munit_assert_int(_rv, ==, 0); \
+        for (_i = 0; _i < _n; _i++) { \
+            struct raft_entry *_entry = &_entries[_i]; \
+            if (_entry->batch != _batch) { \
+                _batch = _entry->batch; \
+                raft_free(_batch); \
+            } \
+        } \
+        if (_entries != NULL) { \
+            raft_free(_entries); \
+        } \
+        if (_snapshot != NULL) { \
+            raft_configuration_close(&_snapshot->configuration); \
+            munit_assert_int(_snapshot->n_bufs, ==, 1); \
+            raft_free(_snapshot->bufs[0].base); \
+            raft_free(_snapshot->bufs); \
+            raft_free(_snapshot); \
+        } \
+ \
+        /* Append the new entries. */ \
+        for (_i = 0; _i < N; _i++) { \
+            struct raft_entry *entry = &_new_entry; \
+            entry->term = 1; \
+            entry->type = RAFT_COMMAND; \
+            entry->buf.base = &_new_entry_data; \
+            entry->buf.len = sizeof _new_entry_data; \
+            entry->batch = NULL; \
+            munit_assert_ptr_not_null(entry->buf.base); \
+            memset(entry->buf.base, 0, entry->buf.len); \
+            *(uint64_t *)entry->buf.base = _data; \
+            _data++; \
+            _req.data = &_done; \
+            _rv = _io.append(&_io, &_req, entry, 1, appendCb); \
+            munit_assert_int(_rv, ==, 0); \
+            LOOP_RUN_UNTIL(&_done); \
+            _done = false; \
+        } \
+ \
+        /* Shutdown the standalone raft_io instance. */ \
+        _done = false; \
+        _io.data = &_done; \
+        _io.close(&_io, closeCb); \
+        LOOP_RUN_UNTIL(&_done); \
+        raft_uv_close(&_io); \
+        raft_uv_tcp_close(&_transport); \
+    } while (0);
+
+/* Initialize a standalone raft_io instance and use it to persist a new
+ * snapshot at the given INDEX and TERM. DATA should be an integer that will
+ * be used as the snapshot content. */
+#define SNAPSHOT_PUT(TERM, INDEX, DATA) \
+    do { \
+        struct raft_uv_transport _transport; \
+        struct raft_io _io; \
+        raft_term _term; \
+        raft_id _voted_for; \
+        struct raft_snapshot *_snapshot; \
+        raft_index _start_index; \
+        struct raft_entry *_entries; \
+        size_t _i; \
+        size_t _n; \
+        void *_batch = NULL; \
+        struct raft_snapshot _new_snapshot; \
+        struct raft_buffer _new_snapshot_buf; \
+        uint64_t _new_snapshot_data = DATA; \
+        struct raft_io_snapshot_put _req; \
+        bool _done = false; \
+        int _rv; \
+ \
+        /* Initialize the instance, loading existing data, but discarding \
+         * it. This makes sure that the start index is correctly set. */ \
+        _transport.version = 1; \
+        _rv = raft_uv_tcp_init(&_transport, &f->loop); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = raft_uv_init(&_io, &f->loop, f->dir, &_transport); \
+        munit_assert_int(_rv, ==, 0); \
+        _rv = _io.init(&_io, 1, "1"); \
+        munit_assert_int(_rv, ==, 0); \
+        raft_uv_set_block_size(&_io, SEGMENT_BLOCK_SIZE); \
+        raft_uv_set_segment_size(&_io, SEGMENT_SIZE); \
+        _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \
+                       &_entries, &_n); \
+        munit_assert_int(_rv, ==, 0); \
+        for (_i = 0; _i < _n; _i++) { \
+            struct raft_entry *_entry = &_entries[_i]; \
+            if (_entry->batch != _batch) { \
+                _batch = _entry->batch; \
+                raft_free(_batch); \
+            } \
+        } \
+        if (_entries != NULL) { \
+            raft_free(_entries); \
+        } \
+        if (_snapshot != NULL) { \
+            raft_configuration_close(&_snapshot->configuration); \
+            munit_assert_int(_snapshot->n_bufs, ==, 1); \
+            raft_free(_snapshot->bufs[0].base); \
+            raft_free(_snapshot->bufs); \
+            raft_free(_snapshot); \
+        } \
+ \
+        /* Persist the new snapshot. */ \
+        _new_snapshot.index = INDEX; \
+        _new_snapshot.term = TERM; \
+        raft_configuration_init(&_new_snapshot.configuration); \
+        _rv = raft_configuration_add(&_new_snapshot.configuration, 1, "1", \
+                                     RAFT_VOTER); \
+        munit_assert_int(_rv, ==, 0); \
+        _new_snapshot.bufs = &_new_snapshot_buf; \
+        _new_snapshot.n_bufs = 1; \
+        _new_snapshot_buf.base = &_new_snapshot_data; \
+        _new_snapshot_buf.len = sizeof _new_snapshot_data; \
+        _req.data = &_done; \
+        _rv = \
+            _io.snapshot_put(&_io, 10, &_req, &_new_snapshot, snapshotPutCb); \
+        munit_assert_int(_rv, ==, 0); \
+        LOOP_RUN_UNTIL(&_done); \
+        raft_configuration_close(&_new_snapshot.configuration); \
+ \
+        /* Shutdown the standalone raft_io instance. */ \
+        _done = false; \
+        _io.data = &_done; \
+        _io.close(&_io, closeCb); \
+        LOOP_RUN_UNTIL(&_done); \
+        raft_uv_close(&_io); \
+        raft_uv_tcp_close(&_transport); \
+    } while (0);
+
+/* Forcibly turn a closed segment into an open one, by renaming the underlying
+ * file and growing its size. */
+#define UNFINALIZE(FIRST_INDEX, LAST_INDEX, COUNTER) \
+    do { \
+        const char *_filename1 = \
+            CLOSED_SEGMENT_FILENAME(FIRST_INDEX, LAST_INDEX); \
+        char _filename2[64]; \
+        sprintf(_filename2, "open-%u", (unsigned)COUNTER); \
+        munit_assert_true(DirHasFile(f->dir, _filename1)); \
+        munit_assert_false(DirHasFile(f->dir, _filename2)); \
+        DirRenameFile(f->dir, _filename1, _filename2); \
+        DirGrowFile(f->dir, _filename2, SEGMENT_SIZE); \
+    } while (0)
+
+#define LOAD_VARS \
+    int _rv; \
+    raft_term _term; \
+    raft_id _voted_for; \
+    struct raft_snapshot *_snapshot; \
+    raft_index _start_index; \
+    struct raft_entry *_entries; \
+    size_t _n;
+
+/* Initialize the raft_io instance, then call raft_io->load() and assert that
+ * it returns the given error code and message. */
+#define LOAD_ERROR(RV, ERRMSG) \
+    do { \
+        LOAD_VARS; \
+        SETUP_UV; \
+        _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \
+                         &_start_index, &_entries, &_n); \
+        munit_assert_int(_rv, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+    } while (0)
+
+#define LOAD_ERROR_NO_SETUP(RV, ERRMSG) \
+    do { \
+        LOAD_VARS; \
+        _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \
+                         &_start_index, &_entries, &_n); \
+        munit_assert_int(_rv, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+    } while (0)
+
+#define LOAD_ERROR_NO_RECOVER(RV, ERRMSG) \
+    do { \
+        LOAD_VARS; \
+        SETUP_UV; \
+        _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, \
+                         &_start_index, &_entries, &_n); \
+        munit_assert_int(_rv, ==, RV); \
+        munit_assert_string_equal(f->io.errmsg, ERRMSG); \
+    } while (0)
+
+#define _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
+    _rv = f->io.load(&f->io, &_term, &_voted_for, &_snapshot, &_start_index, \
+                     &_entries, &_n); \
+    munit_assert_int(_rv, ==, 0); \
+    munit_assert_int(_term, ==, TERM); \
+    munit_assert_int(_voted_for, ==, VOTED_FOR); \
+    munit_assert_int(_start_index, ==, START_INDEX); \
+    if (_snapshot != NULL) { \
+        struct snapshot *_expected = (struct snapshot *)(SNAPSHOT); \
+        munit_assert_ptr_not_null(_snapshot); \
+        munit_assert_int(_snapshot->term, ==, _expected->term); \
+        munit_assert_int(_snapshot->index, ==, _expected->index); \
+        munit_assert_int(_snapshot->n_bufs, ==, 1); \
+        munit_assert_int(*(uint64_t *)_snapshot->bufs[0].base, ==, \
+                         _expected->data); \
+        raft_configuration_close(&_snapshot->configuration); \
+        raft_free(_snapshot->bufs[0].base); \
+        raft_free(_snapshot->bufs); \
+        raft_free(_snapshot); \
+    } \
+    if (_n != 0) { \
+        munit_assert_int(_n, ==, N_ENTRIES); \
+        for (_i = 0; _i < _n; _i++) { \
+            struct raft_entry *_entry = &_entries[_i]; \
+            uint64_t _value = *(uint64_t *)_entry->buf.base; \
+            munit_assert_int(_value, ==, _data); \
+            _data++; \
+        } \
+        for (_i = 0; _i < _n; _i++) { \
+            struct raft_entry *_entry = &_entries[_i]; \
+            if (_entry->batch != _batch) { \
+                _batch = _entry->batch; \
+                raft_free(_batch); \
+            } \
+        } \
+        raft_free(_entries); \
+    }
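+
+/* Note that _LOAD checks N_ENTRIES and the entry data only when at least one
+ * entry was actually loaded, so an expectation of 0 entries is satisfied
+ * vacuously. */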
+
+/* Initialize the raft_io instance, then invoke raft_io->load() and assert
+ * that it returns the given state. If non-NULL, SNAPSHOT points to a struct
+ * snapshot object whose attributes must match the loaded snapshot.
+ * ENTRIES_DATA is supposed to be the integer stored in the data of the first
+ * loaded entry. */
+#define LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, ENTRIES_DATA, N_ENTRIES) \
+    do { \
+        LOAD_VARS; \
+        void *_batch = NULL; \
+        uint64_t _data = ENTRIES_DATA; \
+        unsigned _i; \
+        SETUP_UV; \
+        _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
+    } while (0)
+
+/* Same as LOAD but with auto recovery turned on. */
+#define LOAD_WITH_AUTO_RECOVERY(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, \
+                                ENTRIES_DATA, N_ENTRIES) \
+    do { \
+        LOAD_VARS; \
+        void *_batch = NULL; \
+        uint64_t _data = ENTRIES_DATA; \
+        unsigned _i; \
+        SETUP_UV; \
+        raft_uv_set_auto_recovery(&f->io, true); \
+        _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
+    } while (0)
+
+/* Same as LOAD without SETUP_UV. */
+#define LOAD_NO_SETUP(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, ENTRIES_DATA, \
+                      N_ENTRIES) \
+    do { \
+        LOAD_VARS; \
+        void *_batch = NULL; \
+        uint64_t _data = ENTRIES_DATA; \
+        unsigned _i; \
+        _LOAD(TERM, VOTED_FOR, SNAPSHOT, START_INDEX, N_ENTRIES) \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_UV;
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * raft_io->load()
+ *
+ *****************************************************************************/
+
+SUITE(load)
+
+/* Load the initial state of a pristine server. */
+TEST(load, emptyDir, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+static char *unknownFiles[] = {
+    "garbage",
+    "0000000000000000000000000001-00000000001garbage",
+    "open-1garbage",
+    NULL,
+};
+
+static MunitParameterEnum unknownFilesParams[] = {
+    {"filename", unknownFiles},
+    {NULL, NULL},
+};
+
+/* Files that are not part of the raft state are ignored. */
+TEST(load, ignoreUnknownFiles, setUp, tearDown, 0, unknownFilesParams)
+{
+    struct fixture *f = data;
+    const char *filename = munit_parameters_get(params, "filename");
+    DirWriteFileWithZeros(f->dir, filename, 128);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+static char *unusableFiles[] = {"tmp-0000000001221212-0000000001221217",
+                                "tmp-snapshot-15-8260687-512469866",
+                                "snapshot-525-43326736-880259052",
+                                "snapshot-999-13371337-880259052.meta",
+                                "snapshot-20-8260687-512469866",
+                                "snapshot-88-8260687-512469866.meta",
+                                "snapshot-88-8260999-512469866.meta",
+                                "tmp-snapshot-88-8260999-512469866.meta",
+                                "tmp-snapshot-33-8260687-512469866",
+                                "snapshot-33-8260687-512469866.meta",
+                                "tmp-metadata1",
+                                "tmp-metadata2",
+                                "tmp-open1",
+                                "tmp-open13",
+                                NULL};
+
+static MunitParameterEnum unusableFilesParams[] = {
+    {"filename", unusableFiles},
+    {NULL, NULL},
+};
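+
+/* The file lists above probe the store's fixed filename patterns:
+ * "open-<counter>" for open segments, "<first>-<last>" for closed segments,
+ * and "snapshot-<term>-<index>-<timestamp>" plus a ".meta" companion for
+ * snapshots. Files that merely resemble them (tmp- prefixes, trailing
+ * garbage) are either ignored or deleted, as the surrounding tests verify. */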
+
+/* Files that can no longer be used are removed. */
+TEST(load, removeUnusableFiles, setUp, tearDown, 0, unusableFilesParams)
+{
+    struct fixture *f = data;
+    const char *filename = munit_parameters_get(params, "filename");
+    DirWriteFileWithZeros(f->dir, filename, 128);
+    munit_assert_true(DirHasFile(f->dir, filename));
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    munit_assert_false(DirHasFile(f->dir, filename));
+    return MUNIT_OK;
+}
+
+/* The data directory has an empty open segment. */
+TEST(load, emptyOpenSegment, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    DirWriteFile(f->dir, "open-1", NULL, 0);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    /* The empty segment has been removed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    return MUNIT_OK;
+}
+
+/* The data directory has a freshly allocated open segment filled with
+ * zeros. */
+TEST(load, openSegmentWithTrailingZeros, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    DirWriteFileWithZeros(f->dir, "open-1", 256);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+    /* The empty segment has been removed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    return MUNIT_OK;
+}
+
+/* The data directory has valid closed and open segments. */
+TEST(load, bothOpenAndClosedSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(2, 1);
+    APPEND(1, 3);
+    APPEND(1, 4);
+    UNFINALIZE(4, 4, 1);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         4     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory has an allocated open segment which contains non-zero
+ * corrupted data in its second batch. */
+TEST(load, openSegmentWithNonZeroData, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uint64_t corrupt = 123456789;
+    APPEND(2, 1);
+    UNFINALIZE(1, 2, 1);
+    DirOverwriteFile(f->dir, "open-1", &corrupt, sizeof corrupt, 60);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         1     /* n entries */
+    );
+
+    /* The segment has been removed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+
+    return MUNIT_OK;
+}
+
+/* The data directory has an open segment with a partially written batch that
+ * needs to be truncated. */
+TEST(load, openSegmentWithIncompleteBatch, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uint8_t zero[256];
+    APPEND(2, 1);
+    UNFINALIZE(1, 2, 1);
+    memset(zero, 0, sizeof zero);
+    DirOverwriteFile(f->dir, "open-1", &zero, sizeof zero, 62);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         1     /* n entries */
+    );
+    return MUNIT_OK;
+}
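+
+/* Many of the tests below fabricate open segments by appending normally and
+ * then "unfinalizing" the resulting closed segment: UNFINALIZE renames it
+ * back to open-<counter> and grows it to SEGMENT_SIZE, mimicking a segment
+ * that was still being written when the server stopped. */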
+
+/* The data directory has an open segment whose first batch is only
+ * partially written. In that case the segment gets removed. */
+TEST(load, openSegmentWithIncompleteFirstBatch, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uint8_t buf[4 * WORD_SIZE] = {
+        1, 0, 0, 0, 0, 0, 0, 0, /* Format version */
+        0, 0, 0, 0, 0, 0, 0, 0, /* CRC32 checksums */
+        0, 0, 0, 0, 0, 0, 0, 0, /* Number of entries */
+        0, 0, 0, 0, 0, 0, 0, 0  /* Batch data */
+    };
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+
+    DirOverwriteFile(f->dir, "open-1", buf, sizeof buf, 0);
+
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         0,    /* data for first loaded entry */
+         0     /* n entries */
+    );
+
+    return MUNIT_OK;
+}
+
+/* The data directory has two open segments, each containing one entry. */
+TEST(load, twoOpenSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    APPEND(1, 2);
+    UNFINALIZE(1, 1, 1);
+    UNFINALIZE(2, 2, 2);
+
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         2     /* n entries */
+    );
+
+    /* The first and second segments have been renamed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
+    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
+    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(2, 2));
+
+    return MUNIT_OK;
+}
+
+/* The data directory has two open segments, with the second one filled with
+ * zeros. */
+TEST(load, secondOpenSegmentIsAllZeros, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
+
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         1     /* n entries */
+    );
+
+    /* The first segment has been renamed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
+
+    /* The second segment has been removed. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
+
+    return MUNIT_OK;
+}
+
+/* The data directory has two open segments, the first one has a corrupt
+ * header and auto-recovery is on. */
+TEST(load, twoOpenSegmentsFirstCorruptAutoRecovery, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    /* Load is successful and equals pristine condition. */
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            0,    /* data for first loaded entry */
+                            0     /* n entries */
+    );
+
+    /* The open segments are renamed, and there is no closed segment. */
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(1));
+    munit_assert_false(HAS_OPEN_SEGMENT_FILE(2));
+    munit_assert_false(HAS_CLOSED_SEGMENT_FILE(1, 1));
+
+    return MUNIT_OK;
+}
+
+/* The data directory has two open segments, the first one has a corrupt
+ * header. */
+TEST(load, twoOpenSegmentsFirstCorrupt, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+    DirWriteFileWithZeros(f->dir, "open-2", SEGMENT_SIZE);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+
+    /* Without auto-recovery the load fails and the open segments are left in
+     * place. */
+    munit_assert_true(HAS_OPEN_SEGMENT_FILE(1));
+    munit_assert_true(HAS_OPEN_SEGMENT_FILE(2));
+    return MUNIT_OK;
+}
+
+/* The data directory has a valid open segment. */
+TEST(load, openSegment, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    UNFINALIZE(1, 1, 1);
+    LOAD(0,    /* term */
+         0,    /* voted for */
+         NULL, /* snapshot */
+         1,    /* start index */
+         1,    /* data for first loaded entry */
+         1     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* There is exactly one snapshot and no segments. */
+TEST(load, onlyOneSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        1, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 1, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         2,         /* start index */
+         0,         /* data for first loaded entry */
+         0          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* There are several snapshots, including an incomplete one. The last one is
+ * loaded and the incomplete or older ones are removed. */
+TEST(load, manySnapshots, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        2, /* term */
+        9, /* index */
+        4  /* data */
+    };
+    char filename[64];
+    uint64_t now;
+
+    /* Take a snapshot but then remove the data file, as if the server crashed
+     * before it could complete writing it. */
+    uv_update_time(&f->loop);
+    now = uv_now(&f->loop);
+    sprintf(filename, "snapshot-1-8-%ju", now);
+    SNAPSHOT_PUT(1, 8, 1);
+    DirRemoveFile(f->dir, filename);
+
+    SNAPSHOT_PUT(1, 8, 2);
+    SNAPSHOT_PUT(2, 6, 3);
+    SNAPSHOT_PUT(2, 9, 4);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         10,        /* start index */
+         0,         /* data for first loaded entry */
+         0          /* n entries */
+    );
+
+    /* The orphaned .meta file is removed */
+    char meta_filename[128];
+    sprintf(meta_filename, "%s%s", filename, UV__SNAPSHOT_META_SUFFIX);
+    munit_assert_false(DirHasFile(f->dir, meta_filename));
+
+    return MUNIT_OK;
+}
+
+/* There are two snapshots, but the last one has an empty data file. The first
+ * one is loaded and the empty one is discarded. */
+TEST(load, emptySnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        4, /* index */
+        1  /* data */
+    };
+    char filename[64];
+    uint64_t now;
+
+    SNAPSHOT_PUT(1, 4, 1);
+
+    /* Take a snapshot but then truncate the data file, as if the server ran
+     * out of space before it could write it. */
+    uv_update_time(&f->loop);
+    now = uv_now(&f->loop);
+    sprintf(filename, "snapshot-2-6-%ju", now);
+    SNAPSHOT_PUT(2, 6, 2);
+    DirTruncateFile(f->dir, filename, 0);
+
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         5,         /* start index */
+         0,         /* data for first loaded entry */
+         0          /* n entries */
+    );
+
+    return MUNIT_OK;
+}
+
+/* There is an orphaned snapshot and an orphaned snapshot .meta file; make
+ * sure they are removed. */
+TEST(load, orphanedSnapshotFiles, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    uv_update_time(&f->loop);
+    uint64_t now = uv_now(&f->loop);
+
+    struct snapshot expected_snapshot = {
+        2,  /* term */
+        16, /* index */
+        4   /* data */
+    };
+
+    char filename1_removed[64];
+    char metafilename1_removed[64];
+    char filename2_removed[64];
+    char metafilename2_removed[64];
+
+    /* Take a snapshot but then remove the data file, as if the server crashed
+     * before it could complete writing it. */
+    sprintf(filename1_removed, "snapshot-2-18-%ju", now);
+    sprintf(metafilename1_removed, "snapshot-2-18-%ju%s", now,
+            UV__SNAPSHOT_META_SUFFIX);
+    SNAPSHOT_PUT(2, 18, 1);
+    munit_assert_true(DirHasFile(f->dir, filename1_removed));
+    munit_assert_true(DirHasFile(f->dir, metafilename1_removed));
+    DirRemoveFile(f->dir, filename1_removed);
+
+    /* Take a snapshot but then remove the .meta file */
+    now = uv_now(&f->loop);
+    sprintf(filename2_removed, "snapshot-2-19-%ju", now);
+    sprintf(metafilename2_removed, "snapshot-2-19-%ju%s", now,
+            UV__SNAPSHOT_META_SUFFIX);
+    SNAPSHOT_PUT(2, 19, 2);
+    munit_assert_true(DirHasFile(f->dir, filename2_removed));
+    munit_assert_true(DirHasFile(f->dir, metafilename2_removed));
+    DirRemoveFile(f->dir, metafilename2_removed);
+
+    /* Take a valid snapshot and make sure it's loaded */
+    SNAPSHOT_PUT(2, 16, 4);
+    LOAD(0,                  /* term */
+         0,                  /* voted for */
+         &expected_snapshot, /* snapshot */
+         17,                 /* start index */
+         0,                  /* data for first loaded entry */
+         0                   /* n entries */
+    );
+
+    /* The orphaned files are removed */
+    munit_assert_false(DirHasFile(f->dir, metafilename1_removed));
+    munit_assert_false(DirHasFile(f->dir, filename2_removed));
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment with entries that are no longer
+ * needed, since they are included in a snapshot. We still keep those segments
+ * and just let the next snapshot logic delete them. */
+TEST(load, closedSegmentWithEntriesBehindSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        2, /* index */
+        1  /* data */
+    };
+    APPEND(1, 1);
+    SNAPSHOT_PUT(1, 2, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         3,         /* start index */
+         0,         /* data for first loaded entry */
+         0          /* n entries */
+    );
+    munit_assert_true(HAS_CLOSED_SEGMENT_FILE(1, 1));
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment with entries that are no longer
+ * needed, since they are included in a snapshot. However it also has an open
+ * segment that has enough entries to reach the snapshot last index. */
+TEST(load, openSegmentWithEntriesPastSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        2, /* index */
+        1  /* data */
+    };
+    APPEND(1, 1);
+    APPEND(1, 2);
+    SNAPSHOT_PUT(1, 2, 1);
+    UNFINALIZE(2, 2, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         1,         /* start index */
+         1,         /* data for first loaded entry */
+         2          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment whose filename encodes a number of
+ * entries that differs from the number it actually contains. */
+TEST(load, closedSegmentWithInconsistentFilename, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    DirRenameFile(f->dir, "0000000000000001-0000000000000003",
+                  "0000000000000001-0000000000000004");
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load closed segment 0000000000000001-0000000000000004: found 3 "
+               "entries (expected 4)");
+    return MUNIT_OK;
+}
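+
+/* With raft_uv_set_auto_recovery() enabled (see LOAD_WITH_AUTO_RECOVERY),
+ * load() reacts to a corrupt or inconsistent segment by discarding it, and
+ * anything after it, instead of failing, trading the tail of the log for
+ * availability. The paired tests below exercise both behaviors. */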
+
+/* The data directory has a closed segment whose filename encodes a number of
+ * entries that differs from the number it actually contains, and
+ * auto-recovery is turned on. */
+TEST(load,
+     closedSegmentWithInconsistentFilenameAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    DirRenameFile(f->dir, "0000000000000001-0000000000000003",
+                  "0000000000000001-0000000000000004");
+    /* Load in pristine condition */
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            0,    /* data for first loaded entry */
+                            0     /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment with entries that are no longer
+ * needed, since they are included in a snapshot. It also has an open segment,
+ * however that does not have enough entries to reach the snapshot last
+ * index. */
+TEST(load, openSegmentWithEntriesBehindSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1, 1);
+    APPEND(1, 2);
+    SNAPSHOT_PUT(1, 3, 1);
+    UNFINALIZE(2, 2, 1);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "last entry on disk has index 2, which is behind last "
+               "snapshot's index 3");
+    return MUNIT_OK;
+}
+
+/* The data directory has a closed segment with entries that are no longer
+ * needed, since they are included in a snapshot. It also has an open segment,
+ * however that does not have enough entries to reach the snapshot last
+ * index, and auto-recovery is turned on. */
+TEST(load,
+     openSegmentWithEntriesBehindSnapshotAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    APPEND(1, 1);
+    APPEND(1, 2);
+    SNAPSHOT_PUT(1, 3, 1);
+    UNFINALIZE(2, 2, 1);
+    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
+                            0,         /* voted for */
+                            &snapshot, /* snapshot */
+                            4,         /* start index */
+                            0,         /* data for first loaded entry */
+                            0          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment containing a
+ * valid entry, and no closed segments. */
+TEST(load, openSegmentNoClosedSegmentsSnapshotPresent, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    UNFINALIZE(4, 4, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         4,         /* start index */
+         4,         /* data for first loaded entry */
+         1          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and no closed segments. */
+TEST(load,
+     corruptOpenSegmentNoClosedSegmentsSnapshotPresent,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    UNFINALIZE(4, 4, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and no closed segments. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentNoClosedSegmentsSnapshotPresentWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    UNFINALIZE(4, 4, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    /* Load is successful. */
+    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
+                            0,         /* voted for */
+                            &snapshot, /* snapshot */
+                            4,         /* start index */
+                            1,         /* data for first loaded entry */
+                            1          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and a closed segment. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentSnapshotPresent,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(5, 5, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and a closed segment. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentSnapshotPresentWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(5, 5, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+
+    /* Load is successful. */
+    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
+                            0,         /* voted for */
+                            &snapshot, /* snapshot */
+                            4,         /* start index */
+                            4,         /* data for first loaded entry */
+                            1          /* n entries */
+    );
+
+    /* Open segment has been renamed */
+    munit_assert_false(DirHasFile(f->dir, "open-1"));
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and multiple closed segments. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentsSnapshotPresentWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        3, /* index */
+        1  /* data */
+    };
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    APPEND(1, 6);
+    UNFINALIZE(6, 6, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+
+    LOAD_WITH_AUTO_RECOVERY(0,         /* term */
+                            0,         /* voted for */
+                            &snapshot, /* snapshot */
+                            4,         /* start index */
+                            4,         /* data for first loaded entry */
+                            2          /* n entries */
+    );
+    /* Open segment has been renamed during the first load */
+    munit_assert_false(DirHasFile(f->dir, "open-1"));
+    return MUNIT_OK;
+}
+
+/* The data directory contains a snapshot and an open segment with a corrupt
+ * format header and multiple closed segments. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentsSnapshotPresent,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    SNAPSHOT_PUT(1, 3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    APPEND(1, 6);
+    UNFINALIZE(6, 6, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and an open segment with a
+ * corrupt format header and no snapshot. */
+TEST(load, corruptOpenSegmentClosedSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(4, 1);
+    APPEND(1, 5);
+    UNFINALIZE(5, 5, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and an open segment with a
+ * corrupt format header and no snapshot. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentClosedSegmentsWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    APPEND(4, 1);
+    APPEND(1, 5);
+    UNFINALIZE(5, 5, 1);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    /* Load is successful. */
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            1,    /* data for first loaded entry */
+                            4     /* n entries */
+    );
+    /* Open segment has been renamed */
+    munit_assert_false(DirHasFile(f->dir, "open-1"));
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and two open segments.
+ * The first open segment has a corrupt header. */
+TEST(load, corruptOpenSegmentsClosedSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(4, 4, 1);
+    UNFINALIZE(5, 5, 2);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-1: unexpected format version 0");
+
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and two open segments.
+ * The first open segment has a corrupt header. Auto-recovery is turned on. */
+TEST(load,
+     corruptOpenSegmentsClosedSegmentsWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(4, 4, 1);
+    UNFINALIZE(5, 5, 2);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0);
+
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            1,    /* data for first loaded entry */
+                            3     /* n entries */
+    );
+
+    /* Open segments have been renamed */
+    munit_assert_false(DirHasFile(f->dir, "open-1"));
+    munit_assert_false(DirHasFile(f->dir, "open-2"));
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and two open segments.
+ * The second open segment has a corrupt header. */
+TEST(load, corruptLastOpenSegmentClosedSegments, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(4, 4, 1);
+    UNFINALIZE(5, 5, 2);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-2", &version, sizeof version, 0);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load open segment open-2: unexpected format version 0");
+
+    return MUNIT_OK;
+}
+
+/* The data directory contains a closed segment and two open segments.
+ * The second open segment has a corrupt header. Auto-recovery is turned on. */
+TEST(load,
+     corruptLastOpenSegmentClosedSegmentsWithAutoRecovery,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    APPEND(3, 1);
+    APPEND(1, 4);
+    APPEND(1, 5);
+    UNFINALIZE(4, 4, 1);
+    UNFINALIZE(5, 5, 2);
+
+    /* Corrupt open segment */
+    uint64_t version = 0 /* Format version */;
+    DirOverwriteFile(f->dir, "open-2", &version, sizeof version, 0);
+
+    LOAD_WITH_AUTO_RECOVERY(0,    /* term */
+                            0,    /* voted for */
+                            NULL, /* snapshot */
+                            1,    /* start index */
+                            1,    /* data for first loaded entry */
+                            4     /* n entries */
+    );
+    /* Open segment has been renamed during the first load */
+    munit_assert_false(DirHasFile(f->dir, "open-2"));
+    return MUNIT_OK;
+}
+
+/* The data directory has several closed segments, all with entries compatible
+ * with the snapshot. */
+TEST(load, closedSegmentsOverlappingWithSnapshot, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct snapshot snapshot = {
+        1, /* term */
+        4, /* index */
+        1  /* data */
+    };
+    APPEND(1, 1);
+    APPEND(2, 2);
+    APPEND(3, 4);
+    SNAPSHOT_PUT(1, 4, 1);
+    LOAD(0,         /* term */
+         0,         /* voted for */
+         &snapshot, /* snapshot */
+         1,         /* start index */
+         1,         /* data for first loaded entry */
+         6          /* n entries */
+    );
+    return MUNIT_OK;
+}
+
+/* The data directory has several closed segments, the last of which is
+ * corrupt. There is a snapshot. */
+TEST(load,
+     closedSegmentsWithSnapshotLastSegmentCorrupt,
+     setUp,
+     tearDown,
+     0,
+     NULL)
+{
+    struct fixture *f = data;
+    SNAPSHOT_PUT(1, 4, 1);
+    APPEND(1, 5);
+    APPEND(2, 6);
+    APPEND(2, 8);
+
+    /* Corrupt the last closed segment */
+    size_t offset =
+        WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */;
+    uint32_t corrupted = 123456789;
+    DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9), &corrupted,
+                     sizeof corrupted, offset);
+    LOAD_ERROR(RAFT_CORRUPT,
+               "load closed segment 0000000000000008-0000000000000009: entries "
+               "batch 1 starting at byte 8: data checksum mismatch");
+    return MUNIT_OK;
+}
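+
+/* The corruption in these tests targets a fixed spot: a segment starts with
+ * an 8-byte format version, and each batch leads with two 4-byte checksums
+ * (header, then data), so overwriting 4 bytes at offset 12 clobbers the
+ * stored data checksum and reliably produces a "data checksum mismatch" for
+ * batch 1, which starts at byte 8. */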
*/ +TEST(load, + closedSegmentsWithSnapshotLastSegmentCorruptAutoRecovery, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + struct snapshot snapshot = { + 1, /* term */ + 4, /* index */ + 1 /* data */ + }; + SNAPSHOT_PUT(1, 4, 1); + APPEND(1, 5); + APPEND(2, 6); + APPEND(2, 8); + + /* Corrupt the last closed segment */ + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 9), &corrupted, + sizeof corrupted, offset); + LOAD_WITH_AUTO_RECOVERY(0, /* term */ + 0, /* voted for */ + &snapshot, /* snapshot */ + 5, /* start index */ + 5, /* data for first loaded entry */ + 3 /* n entries */ + ); + return MUNIT_OK; +} + +/* The data directory has several closed segments, the last of which is corrupt. + * There is an open segment and a snapshot. Auto-recovery is turned on. */ +TEST(load, + closedSegmentsWithSnapshotLastSegmentCorruptOpenSegmentWithAutoRecovery, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + struct snapshot snapshot = { + 1, /* term */ + 4, /* index */ + 1 /* data */ + }; + SNAPSHOT_PUT(1, 4, 1); + APPEND(1, 5); + APPEND(2, 6); + APPEND(1, 8); + APPEND(1, 9); + UNFINALIZE(9, 9, 1); + + /* Corrupt the last closed segment */ + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 8), &corrupted, + sizeof corrupted, offset); + munit_assert_true(HAS_OPEN_SEGMENT_FILE(1)); + + LOAD_WITH_AUTO_RECOVERY(0, /* term */ + 0, /* voted for */ + &snapshot, /* snapshot */ + 5, /* start index */ + 5, /* data for first loaded entry */ + 3 /* n entries */ + ); + munit_assert_false(HAS_OPEN_SEGMENT_FILE(1)); + return MUNIT_OK; +} + +/* The data directory has several closed segments, the last of which is corrupt. + * There is an open segment and a snapshot. */ +TEST(load, + closedSegmentsWithSnapshotLastSegmentCorruptOpenSegment, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(1, 4, 1); + APPEND(1, 5); + APPEND(2, 6); + APPEND(1, 8); + APPEND(1, 9); + UNFINALIZE(9, 9, 1); + + /* Corrupt the last closed segment */ + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(8, 8), &corrupted, + sizeof corrupted, offset); + munit_assert_true(HAS_OPEN_SEGMENT_FILE(1)); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000008-0000000000000008: entries " + "batch 1 starting at byte 8: data checksum mismatch"); + return MUNIT_OK; +} + +/* The data directory has several closed segments, the second to last one of + * which is corrupt. There is a snapshot. */ +TEST(load, + closedSegmentsWithSnapshotSecondLastSegmentCorrupt, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(1, 4, 1); + APPEND(1, 5); + APPEND(2, 6); + APPEND(2, 8); + + /* Corrupt the second last closed segment */ + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(6, 7), &corrupted, + sizeof corrupted, offset); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000006-0000000000000007: entries " + "batch 1 starting at byte 8: data checksum mismatch"); + + /* Second load still fails. 
*/ + LOAD_ERROR_NO_SETUP( + RAFT_CORRUPT, + "load closed segment 0000000000000006-0000000000000007: entries " + "batch 1 starting at byte 8: data checksum mismatch"); + + return MUNIT_OK; +} + +/* The data directory has several closed segments, some of which have a gap, + * which is still compatible with the snapshot. */ +TEST(load, nonContiguousClosedSegments, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct snapshot snapshot = { + 1, /* term */ + 4, /* index */ + 1 /* data */ + }; + APPEND(1, 1); + APPEND(2, 2); + APPEND(3, 4); + SNAPSHOT_PUT(1, 4, 1); + DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(2, 3)); + LOAD(0, /* term */ + 0, /* voted for */ + &snapshot, /* snapshot */ + 4, /* start index */ + 4, /* data for first loaded entry */ + 3 /* n entries */ + ); + return MUNIT_OK; +} + +/* If the data directory has a closed segment whose start index is beyond the + * snapshot's last index, an error is returned. */ +TEST(load, closedSegmentWithEntriesPastSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint64_t now; + char errmsg[128]; + APPEND(5, 1); + APPEND(1, 5); + uv_update_time(&f->loop); + now = uv_now(&f->loop); + sprintf(errmsg, + "closed segment 0000000000000006-0000000000000006 is past last " + "snapshot snapshot-1-4-%ju", + now); + SNAPSHOT_PUT(1, 4, 1); + DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 5)); + LOAD_ERROR(RAFT_CORRUPT, errmsg); + return MUNIT_OK; +} + +/* The data directory has an open segment which has incomplete format data. */ +TEST(load, openSegmentWithIncompleteFormat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + DirWriteFileWithZeros(f->dir, "open-1", WORD_SIZE / 2); + LOAD_ERROR(RAFT_IOERR, "load open segment open-1: file has only 4 bytes"); + return MUNIT_OK; +} + +/* The data directory has an open segment which has an incomplete batch + * preamble. */ +TEST(load, openSegmentWithIncompletePreamble, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = WORD_SIZE /* Format version */ + WORD_SIZE /* Checksums */; + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirTruncateFile(f->dir, "open-1", offset); + LOAD_ERROR(RAFT_IOERR, + "load open segment open-1: entries batch 1 starting at byte 16: " + "read preamble: short read: 0 bytes instead of 8"); + return MUNIT_OK; +} + +/* The data directory has an open segment which has an incomplete batch header. */ +TEST(load, openSegmentWithIncompleteBatchHeader, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = WORD_SIZE + /* Format version */ + WORD_SIZE + /* Checksums */ + WORD_SIZE + /* Number of entries */ + WORD_SIZE /* Partial batch header */; + + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirTruncateFile(f->dir, "open-1", offset); + LOAD_ERROR(RAFT_IOERR, + "load open segment open-1: entries batch 1 starting at byte 8: " + "read header: short read: 8 bytes instead of 16"); + return MUNIT_OK; +} + +/* The data directory has an open segment which has incomplete batch data.
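+ * The truncation offset below follows the segment layout assumed throughout + * these tests: one word each for the format version, the checksums, the + * entry count, the entry term and the entry type/size, then half a word of + * (truncated) entry data.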
*/ +TEST(load, openSegmentWithIncompleteBatchData, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = WORD_SIZE + /* Format version */ + WORD_SIZE + /* Checksums */ + WORD_SIZE + /* Number of entries */ + WORD_SIZE + /* Entry term */ + WORD_SIZE + /* Entry type and data size */ + WORD_SIZE / 2 /* Partial entry data */; + + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirTruncateFile(f->dir, "open-1", offset); + LOAD_ERROR(RAFT_IOERR, + "load open segment open-1: entries batch 1 starting at byte 8: " + "read data: short read: 4 bytes instead of 8"); + return MUNIT_OK; +} + +/* The data directory has a closed segment which has corrupted batch header. */ +TEST(load, closedSegmentWithCorruptedBatchHeader, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = WORD_SIZE /* Format version */; + uint64_t corrupted = 12345678; + APPEND(1, 1); + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), &corrupted, + sizeof corrupted, offset); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000001-0000000000000001: entries " + "batch 1 starting at byte 8: header checksum mismatch"); + return MUNIT_OK; +} + +/* The data directory has a closed segment which has corrupted batch data. */ +TEST(load, closedSegmentWithCorruptedBatchData, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + size_t offset = + WORD_SIZE /* Format version */ + WORD_SIZE / 2 /* Header checksum */; + uint32_t corrupted = 123456789; + APPEND(1, 1); + DirOverwriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), &corrupted, + sizeof corrupted, offset); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000001-0000000000000001: entries " + "batch 1 starting at byte 8: data checksum mismatch"); + return MUNIT_OK; +} + +/* The data directory has a closed segment whose first index does not match what + * we expect. */ +TEST(load, closedSegmentWithBadIndex, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1, 1); + APPEND(1, 2); + DirRemoveFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1)); + LOAD_ERROR(RAFT_CORRUPT, + "unexpected closed segment 0000000000000002-0000000000000002: " + "first index should have been 1"); + return MUNIT_OK; +} + +/* The data directory has an empty closed segment. */ +TEST(load, emptyClosedSegment, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + DirWriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), NULL, 0); + LOAD_ERROR( + RAFT_CORRUPT, + "load closed segment 0000000000000001-0000000000000001: file is empty"); + return MUNIT_OK; +} + +/* The data directory has a closed segment with an unexpected format. */ +TEST(load, closedSegmentWithBadFormat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t buf[8] = {2, 0, 0, 0, 0, 0, 0, 0}; + DirWriteFile(f->dir, CLOSED_SEGMENT_FILENAME(1, 1), buf, sizeof buf); + LOAD_ERROR(RAFT_CORRUPT, + "load closed segment 0000000000000001-0000000000000001: " + "unexpected format version 2"); + return MUNIT_OK; +} + +/* The data directory has an open segment which is not readable. */ +TEST(load, openSegmentWithNoAccessPermission, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* Skip the test when running as root, since EACCES would not be triggered + * in that case. */ + if (getuid() == 0) { + SETUP_UV; /* Setup the uv object since teardown expects it. 
*/ + return MUNIT_SKIP; + } + + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirMakeFileUnreadable(f->dir, "open-1"); + LOAD_ERROR(RAFT_IOERR, + "load open segment open-1: read file: open: permission denied"); + return MUNIT_OK; +} + +/* The data directory has an open segment with format set to 0 and non-zero + * content. */ +TEST(load, openSegmentWithZeroFormatAndThenData, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint64_t version = 0 /* Format version */; + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirOverwriteFile(f->dir, "open-1", &version, sizeof version, 0); + LOAD_ERROR(RAFT_CORRUPT, + "load open segment open-1: unexpected format version 0"); + return MUNIT_OK; +} + +/* The data directory has an open segment with an unexpected format. */ +TEST(load, openSegmentWithBadFormat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t version[8] = {2, 0, 0, 0, 0, 0, 0, 0}; + APPEND(1, 1); + UNFINALIZE(1, 1, 1); + DirOverwriteFile(f->dir, "open-1", version, sizeof version, 0); + LOAD_ERROR(RAFT_CORRUPT, + "load open segment open-1: unexpected format version 2"); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_recover.c b/test/raft/integration/test_uv_recover.c new file mode 100644 index 000000000..f1435a656 --- /dev/null +++ b/test/raft/integration/test_uv_recover.c @@ -0,0 +1,80 @@ +#include "../lib/runner.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; +}; + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_UV; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV; + TEAR_DOWN_UV_DEPS; + free(f); +} + +/****************************************************************************** + * + * raft_io->recover() + * + *****************************************************************************/ + +SUITE(recover) + +/* Invoke recover and assert that it fails with the given error. */ +#define RECOVER_ERROR(RV, CONF) \ + { \ + int rv_; \ + rv_ = f->io.recover(&f->io, CONF); \ + munit_assert_int(rv_, ==, RV); \ + } + +/* Invoke recover and assert that it succeeds */ +#define RECOVER(CONF) RECOVER_ERROR(0, CONF) + +/* If the instance has been already initialized, an error is returned. */ +/* A new configuration is saved as last entry on disk. 
*/ +TEST(recover, newConfiguration, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_configuration configuration1; + struct raft_configuration configuration2; + int rv; + + /* Bootstrap using an initial configuration */ + raft_configuration_init(&configuration1); + rv = raft_configuration_add(&configuration1, 1, "1", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + rv = raft_configuration_add(&configuration1, 2, "2", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + rv = f->io.bootstrap(&f->io, &configuration1); + munit_assert_int(rv, ==, 0); + + /* Bootstrap using a different configuration */ + raft_configuration_init(&configuration2); + rv = raft_configuration_add(&configuration2, 1, "1", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + + RECOVER(&configuration2); + + raft_configuration_close(&configuration1); + raft_configuration_close(&configuration2); + + return 0; +} diff --git a/test/raft/integration/test_uv_recv.c b/test/raft/integration/test_uv_recv.c new file mode 100644 index 000000000..9c49394d8 --- /dev/null +++ b/test/raft/integration/test_uv_recv.c @@ -0,0 +1,480 @@ +#include "../lib/runner.h" +#include "../lib/tcp.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance. + * + *****************************************************************************/ + +struct peer +{ + struct uv_loop_s loop; + struct raft_uv_transport transport; + struct raft_io io; +}; + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_TCP; + FIXTURE_UV; + struct peer peer; + bool closed; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + struct raft_message *message; + bool done; +}; + +static void recvCb(struct raft_io *io, struct raft_message *m1) +{ + struct result *result = io->data; + struct raft_message *m2 = result->message; + unsigned i; + munit_assert_int(m1->type, ==, m2->type); + switch (m1->type) { + case RAFT_IO_REQUEST_VOTE: + munit_assert_int(m1->request_vote.term, ==, m2->request_vote.term); + munit_assert_int(m1->request_vote.candidate_id, ==, + m2->request_vote.candidate_id); + munit_assert_int(m1->request_vote.last_log_index, ==, + m2->request_vote.last_log_index); + munit_assert_int(m1->request_vote.last_log_term, ==, + m2->request_vote.last_log_term); + munit_assert_int(m1->request_vote.disrupt_leader, ==, + m2->request_vote.disrupt_leader); + break; + case RAFT_IO_REQUEST_VOTE_RESULT: + munit_assert_int(m1->request_vote_result.term, ==, + m2->request_vote_result.term); + munit_assert_int(m1->request_vote_result.vote_granted, ==, + m2->request_vote_result.vote_granted); + break; + case RAFT_IO_APPEND_ENTRIES: + munit_assert_int(m1->append_entries.n_entries, ==, + m2->append_entries.n_entries); + for (i = 0; i < m1->append_entries.n_entries; i++) { + struct raft_entry *entry1 = &m1->append_entries.entries[i]; + struct raft_entry *entry2 = &m2->append_entries.entries[i]; + munit_assert_int(entry1->term, ==, entry2->term); + munit_assert_int(entry1->type, ==, entry2->type); + munit_assert_int(entry1->buf.len, ==, entry2->buf.len); + munit_assert_int( + memcmp(entry1->buf.base, entry2->buf.base, entry1->buf.len), + ==, 0); + } + if (m1->append_entries.n_entries > 0) { + raft_free(m1->append_entries.entries[0].batch); + raft_free(m1->append_entries.entries); + } + break; + case RAFT_IO_APPEND_ENTRIES_RESULT: 
+ munit_assert_int(m1->append_entries_result.term, ==, + m2->append_entries_result.term); + munit_assert_int(m1->append_entries_result.rejected, ==, + m2->append_entries_result.rejected); + munit_assert_int(m1->append_entries_result.last_log_index, ==, + m2->append_entries_result.last_log_index); + break; + case RAFT_IO_INSTALL_SNAPSHOT: + munit_assert_int(m1->install_snapshot.conf.n, ==, + m2->install_snapshot.conf.n); + for (i = 0; i < m1->install_snapshot.conf.n; i++) { + struct raft_server *s1 = &m1->install_snapshot.conf.servers[i]; + struct raft_server *s2 = &m2->install_snapshot.conf.servers[i]; + munit_assert_int(s1->id, ==, s2->id); + munit_assert_string_equal(s1->address, s2->address); + munit_assert_int(s1->role, ==, s2->role); + } + munit_assert_int(m1->install_snapshot.data.len, ==, + m2->install_snapshot.data.len); + munit_assert_int(memcmp(m1->install_snapshot.data.base, + m2->install_snapshot.data.base, + m2->install_snapshot.data.len), + ==, 0); + raft_configuration_close(&m1->install_snapshot.conf); + raft_free(m1->install_snapshot.data.base); + break; + case RAFT_IO_TIMEOUT_NOW: + munit_assert_int(m1->timeout_now.term, ==, m2->timeout_now.term); + munit_assert_int(m1->timeout_now.last_log_index, ==, + m2->timeout_now.last_log_index); + munit_assert_int(m1->timeout_now.last_log_term, ==, + m2->timeout_now.last_log_term); + break; + }; + result->done = true; +} + +static void peerSendCb(struct raft_io_send *req, int status) +{ + bool *done = req->data; + munit_assert_int(status, ==, 0); + *done = true; +} + +static void peerCloseCb(struct raft_io *io) +{ + bool *done = io->data; + *done = true; +} + +/* Set up the fixture's peer raft_io instance. */ +#define PEER_SETUP \ + do { \ + struct uv_loop_s *_loop = &f->peer.loop; \ + struct raft_uv_transport *_transport = &f->peer.transport; \ + struct raft_io *_io = &f->peer.io; \ + int _rv; \ + _rv = uv_loop_init(_loop); \ + munit_assert_int(_rv, ==, 0); \ + _transport->version = 1; \ + _rv = raft_uv_tcp_init(_transport, _loop); \ + munit_assert_int(_rv, ==, 0); \ + _rv = raft_uv_init(_io, _loop, f->dir, _transport); \ + munit_assert_int(_rv, ==, 0); \ + _rv = _io->init(_io, 2, "127.0.0.1:9002"); \ + munit_assert_int(_rv, ==, 0); \ + } while (0) + +/* Tear down the fixture's peer raft_io instance. */ +#define PEER_TEAR_DOWN \ + do { \ + struct uv_loop_s *_loop = &f->peer.loop; \ + struct raft_uv_transport *_transport = &f->peer.transport; \ + struct raft_io *_io = &f->peer.io; \ + bool _done = false; \ + int _i; \ + _done = false; \ + _io->data = &_done; \ + _io->close(_io, peerCloseCb); \ + for (_i = 0; _i < 10; _i++) { \ + if (_done) { \ + break; \ + } \ + uv_run(_loop, UV_RUN_ONCE); \ + } \ + uv_run(_loop, UV_RUN_DEFAULT); \ + munit_assert_true(_done); \ + raft_uv_close(_io); \ + raft_uv_tcp_close(_transport); \ + uv_loop_close(_loop); \ + } while (0) + +/* Send a message to the main fixture's raft_io instance using the fixture's + * peer instance. 
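+ * The macro drives the peer's loop for at most 10 iterations and fails the + * test if the send callback has not fired by then, so a hung write surfaces + * as an assertion failure rather than a stuck test run.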
*/ +#define PEER_SEND(MESSAGE) \ + do { \ + struct uv_loop_s *_loop = &f->peer.loop; \ + struct raft_io *_io = &f->peer.io; \ + struct raft_io_send _req; \ + bool _done = false; \ + int _i; \ + int _rv; \ + (MESSAGE)->server_id = 1; \ + (MESSAGE)->server_address = "127.0.0.1:9001"; \ + _req.data = &_done; \ + _rv = _io->send(_io, &_req, MESSAGE, peerSendCb); \ + munit_assert_int(_rv, ==, 0); \ + for (_i = 0; _i < 10; _i++) { \ + if (_done) { \ + break; \ + } \ + uv_run(_loop, UV_RUN_ONCE); \ + } \ + munit_assert_true(_done); \ + } while (0) + +/* Establish a connection and send a handshake using plain TCP. */ +#define PEER_HANDSHAKE \ + do { \ + uint8_t _handshake[] = { \ + 6, 6, 6, 0, 0, 0, 0, 0, /* Protocol */ \ + 1, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ \ + 2, 0, 0, 0, 0, 0, 0, 0, /* Address length, in words */ \ + 0, 0, 0, 0, 0, 0, 0, 0, /* First address word */ \ + 0, 0, 0, 0, 0, 0, 0, 0 /* Second address word */ \ + }; \ + sprintf((char *)&_handshake[24], "127.0.0.1:666"); \ + TCP_CLIENT_CONNECT(9001); \ + TCP_CLIENT_SEND(_handshake, sizeof _handshake); \ + } while (0); + +/* Run the loop until a new message is received. Assert that the received + * message matches the given one. */ +#define RECV(MESSAGE) \ + do { \ + struct result _result = {MESSAGE, false}; \ + f->io.data = &_result; \ + LOOP_RUN_UNTIL(&_result.done); \ + f->io.data = NULL; \ + } while (0) + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_TCP; + PEER_SETUP; + f->io.data = f; + f->closed = false; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + PEER_TEAR_DOWN; + TEAR_DOWN_TCP; + TEAR_DOWN_UV_DEPS; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + int rv; + SETUP_UV; + f->io.data = f; + rv = f->io.start(&f->io, 10000, NULL, recvCb); + munit_assert_int(rv, ==, 0); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV; + tearDownDeps(f); +} + +/****************************************************************************** + * + * raft_io_recv_cb + * + *****************************************************************************/ + +SUITE(recv) + +/* Receive the very first message over the connection. */ +TEST(recv, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_REQUEST_VOTE; + message.request_vote.candidate_id = 2; + message.request_vote.last_log_index = 123; + message.request_vote.last_log_term = 2; + message.request_vote.disrupt_leader = false; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive a first message, then another one. */ +TEST(recv, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_REQUEST_VOTE; + message.request_vote.candidate_id = 2; + message.request_vote.last_log_index = 123; + message.request_vote.last_log_term = 2; + message.request_vote.disrupt_leader = true; + PEER_SEND(&message); + RECV(&message); + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive a RequestVote result message.
*/ +TEST(recv, requestVoteResult, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_REQUEST_VOTE_RESULT; + message.request_vote_result.term = 3; + message.request_vote_result.vote_granted = true; + message.request_vote_result.pre_vote = false; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive an AppendEntries message with two entries. */ +TEST(recv, appendEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry entries[2]; + struct raft_message message; + uint8_t data1[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + uint8_t data2[8] = {8, 7, 6, 5, 4, 3, 2, 1}; + + entries[0].type = RAFT_COMMAND; + entries[0].buf.base = data1; + entries[0].buf.len = sizeof data1; + + entries[1].type = RAFT_COMMAND; + entries[1].buf.base = data2; + entries[1].buf.len = sizeof data2; + + message.type = RAFT_IO_APPEND_ENTRIES; + message.append_entries.entries = entries; + message.append_entries.n_entries = 2; + + PEER_SEND(&message); + RECV(&message); + + return MUNIT_OK; +} + +/* Receive an AppendEntries message with no entries (i.e. a heartbeat). */ +TEST(recv, heartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_APPEND_ENTRIES; + message.append_entries.entries = NULL; + message.append_entries.n_entries = 0; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive an AppendEntries result message. */ +TEST(recv, appendEntriesResult, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_APPEND_ENTRIES_RESULT; + message.append_entries_result.term = 3; + message.append_entries_result.rejected = 0; + message.append_entries_result.last_log_index = 123; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* Receive an InstallSnapshot message. */ +TEST(recv, installSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + uint8_t snapshot_data[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + int rv; + + message.type = RAFT_IO_INSTALL_SNAPSHOT; + message.install_snapshot.term = 2; + message.install_snapshot.last_index = 123; + message.install_snapshot.last_term = 1; + raft_configuration_init(&message.install_snapshot.conf); + rv = raft_configuration_add(&message.install_snapshot.conf, 1, "1", + RAFT_VOTER); + munit_assert_int(rv, ==, 0); + message.install_snapshot.data.len = sizeof snapshot_data; + message.install_snapshot.data.base = snapshot_data; + + PEER_SEND(&message); + RECV(&message); + + raft_configuration_close(&message.install_snapshot.conf); + + return MUNIT_OK; +} + +/* Receive a TimeoutNow message. */ +TEST(recv, timeoutNow, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_message message; + message.type = RAFT_IO_TIMEOUT_NOW; + message.timeout_now.term = 3; + message.timeout_now.last_log_index = 123; + message.timeout_now.last_log_term = 2; + PEER_SEND(&message); + RECV(&message); + return MUNIT_OK; +} + +/* The handshake fails because of an unexpected protocol version. */ +TEST(recv, badProtocol, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t handshake[] = { + 6, 6, 6, 0, 0, 0, 0, 0, /* Protocol */ + 1, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 2, 0, 0, 0, 0, 0, 0, 0 /* Address length */ + }; + TCP_CLIENT_CONNECT(9001); + TCP_CLIENT_SEND(handshake, sizeof handshake); + LOOP_RUN(2); + return MUNIT_OK; +} + +/* A message can't have zero length.
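+ * The second word of the header carries the message size; a zero there is + * expected to make the receiving end drop the connection rather than attempt + * a zero-byte read (the test only checks that the loop survives it).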
*/ +TEST(recv, badSize, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t header[] = { + 1, 0, 0, 0, 0, 0, 0, 0, /* Message type */ + 0, 0, 0, 0, 0, 0, 0, 0 /* Message size */ + }; + PEER_HANDSHAKE; + TCP_CLIENT_SEND(header, sizeof header); + LOOP_RUN(2); + return MUNIT_OK; +} + +/* A message with a bad type causes the connection to be aborted. */ +TEST(recv, badType, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t header[] = { + 1, 2, 3, 4, 5, 6, 7, 8, /* Message type */ + 0, 0, 0, 0, 0, 0, 0, 0 /* Message size */ + }; + PEER_HANDSHAKE; + TCP_CLIENT_SEND(header, sizeof header); + LOOP_RUN(2); + return MUNIT_OK; +} + +/* The backend is closed just before accepting a new connection. */ +TEST(recv, closeBeforeAccept, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + uint8_t header[] = { + 1, 2, 3, 4, 5, 6, 7, 8, /* Message type */ + 0, 0, 0, 0, 0, 0, 0, 0 /* Message size */ + }; + PEER_HANDSHAKE; + TCP_CLIENT_SEND(header, sizeof header); + LOOP_RUN(1); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The backend is closed after receiving the header of an AppendEntries + * message. */ +TEST(recv, closeAfterAppendEntriesHeader, setUp, tearDown, 0, NULL) +{ + /* TODO */ + return MUNIT_SKIP; +} diff --git a/test/raft/integration/test_uv_send.c b/test/raft/integration/test_uv_send.c new file mode 100644 index 000000000..056944a4d --- /dev/null +++ b/test/raft/integration/test_uv_send.c @@ -0,0 +1,413 @@ +#include + +#include "../lib/runner.h" +#include "../lib/tcp.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance and some pre-set messages. + * + *****************************************************************************/ + +#define N_MESSAGES 5 + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_TCP_SERVER; + FIXTURE_UV; + struct raft_message messages[N_MESSAGES]; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; +}; + +static void sendCbAssertResult(struct raft_io_send *req, int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +/* Get I'th fixture's message. */ +#define MESSAGE(I) (&f->messages[I]) + +/* Submit a send request for the I'th fixture's message. */ +#define SEND_SUBMIT(I, RV, STATUS) \ + struct raft_io_send _req##I; \ + struct result _result##I = {STATUS, false}; \ + int _rv##I; \ + _req##I.data = &_result##I; \ + _rv##I = \ + f->io.send(&f->io, &_req##I, &f->messages[I], sendCbAssertResult); \ + munit_assert_int(_rv##I, ==, RV) + +/* Wait for the submit request of the I'th message to finish. */ +#define SEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done) + +/* Submit a send request for the I'th fixture's message and wait for the + * operation to successfully complete. */ +#define SEND(I) \ + do { \ + SEND_SUBMIT(I, 0 /* rv */, 0 /* status */); \ + SEND_WAIT(I); \ + } while (0) + +/* Submit a send request and assert that it fails synchronously with the + * given error code and message. */ +#define SEND_ERROR(I, RV, ERRMSG) \ + do { \ + SEND_SUBMIT(I, RV, 0 /* status */); \ + /* munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ + } while (0) + +/* Submit a send request and wait for the operation to fail with the given code + * and message. 
*/ +#define SEND_FAILURE(I, STATUS, ERRMSG) \ + do { \ + SEND_SUBMIT(I, 0 /* rv */, STATUS); \ + SEND_WAIT(I); \ + /*munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ + } while (0) + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_TCP_SERVER; + f->io.data = f; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_TCP_SERVER; + TEAR_DOWN_UV_DEPS; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + unsigned i; + SETUP_UV; + raft_uv_set_connect_retry_delay(&f->io, 1); + for (i = 0; i < N_MESSAGES; i++) { + struct raft_message *message = &f->messages[i]; + message->type = RAFT_IO_REQUEST_VOTE; + message->server_id = 1; + message->server_address = f->server.address; + } + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV; + tearDownDeps(f); +} + +/****************************************************************************** + * + * raft_io->send() + * + *****************************************************************************/ + +SUITE(send) + +/* The first time a request is sent to a server a connection attempt is + * triggered. If the connection succeeds the request gets written out. */ +TEST(send, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SEND(0); + return MUNIT_OK; +} + +/* The second time a request is sent it re-uses the connection that was already + * established */ +TEST(send, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SEND(0); + SEND(0); + return MUNIT_OK; +} + +/* Submit a few send requests in parallel. */ +TEST(send, parallel, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SEND_SUBMIT(0 /* message */, 0 /* rv */, 0 /* status */); + SEND_SUBMIT(1 /* message */, 0 /* rv */, 0 /* status */); + SEND_WAIT(0); + SEND_WAIT(1); + return MUNIT_OK; +} + +/* Send a request vote result message. */ +TEST(send, voteResult, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->type = RAFT_IO_REQUEST_VOTE_RESULT; + SEND(0); + return MUNIT_OK; +} + +/* Send an append entries message. */ +TEST(send, appendEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry entries[2]; + entries[0].buf.base = raft_malloc(16); + entries[0].buf.len = 16; + entries[1].buf.base = raft_malloc(8); + entries[1].buf.len = 8; + + MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; + MESSAGE(0)->append_entries.entries = entries; + MESSAGE(0)->append_entries.n_entries = 2; + + SEND(0); + + raft_free(entries[0].buf.base); + raft_free(entries[1].buf.base); + + return MUNIT_OK; +} + +/* Send an append entries message with zero entries (i.e. a heartbeat). */ +TEST(send, heartbeat, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; + MESSAGE(0)->append_entries.entries = NULL; + MESSAGE(0)->append_entries.n_entries = 0; + SEND(0); + return MUNIT_OK; +} + +/* Send an append entries result message. 
*/ +TEST(send, appendEntriesResult, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES_RESULT; + SEND(0); + return MUNIT_OK; +} + +/* Send an install snapshot message. */ +TEST(send, installSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_install_snapshot *p = &MESSAGE(0)->install_snapshot; + int rv; + + MESSAGE(0)->type = RAFT_IO_INSTALL_SNAPSHOT; + + raft_configuration_init(&p->conf); + rv = raft_configuration_add(&p->conf, 1, "1", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + + p->data.len = 8; + p->data.base = raft_malloc(p->data.len); + + SEND(0); + + raft_configuration_close(&p->conf); + raft_free(p->data.base); + + return MUNIT_OK; +} + +/* A connection attempt fails asynchronously after the connect function + * returns. */ +TEST(send, noConnection, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->server_address = "127.0.0.1:123456"; + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The message has an invalid IPv4 address. */ +TEST(send, badAddress, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->server_address = "1"; + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* Make sure UvSend doesn't use a stale connection for a certain server id + * by first sending a message to a valid address and then sending a message to + * an invalid address, making sure the valid connection is not reused. + * Afterwards assert that a send to the correct address still succeeds. */ +TEST(send, changeToUnconnectedAddress, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + + /* Send a message to a server and a connected address */ + SEND(0); + + /* Send a message to the same server, but update the address to an + * unconnected address and assert it fails. */ + munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(1)->server_id); + MESSAGE(1)->server_address = "127.0.0.2:1"; + SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + + /* Send another message to the same server and connected address */ + munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(2)->server_id); + SEND(2); + + /* Send another message to the same server and connected address */ + munit_assert_ullong(MESSAGE(0)->server_id, ==, MESSAGE(3)->server_id); + SEND(3); + + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* The message has an invalid type. */ +TEST(send, badMessage, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + MESSAGE(0)->type = 666; + SEND_ERROR(0, RAFT_MALFORMED, ""); + return MUNIT_OK; +} + +/* Old send requests that have accumulated and could not yet be sent are + * progressively evicted. */ +TEST(send, evictOldPending, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + TCP_SERVER_STOP; + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_NOCONNECTION /* status */); + SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + SEND_SUBMIT(2 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + SEND_SUBMIT(3 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + SEND_WAIT(0); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* After the connection is established the peer dies and then comes back a + * little bit later. 
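+ * The write that was in flight when the peer died is expected to fail with + * RAFT_IOERR, and the next send should trigger a fresh connection attempt.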
*/ +TEST(send, reconnectAfterWriteError, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int socket; + SEND(0); + socket = TcpServerAccept(&f->server); + close(socket); + SEND_FAILURE(0, RAFT_IOERR, ""); + SEND(0); + return MUNIT_OK; +} + +/* After the connection is established the peer dies and then comes back a + * little bit later. At the time the peer died there were several writes + * pending. */ +TEST(send, reconnectAfterMultipleWriteErrors, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int socket; + signal(SIGPIPE, SIG_IGN); + SEND(0); + socket = TcpServerAccept(&f->server); + close(socket); + SEND_SUBMIT(1 /* message */, 0 /* rv */, RAFT_IOERR /* status */); + SEND_SUBMIT(2 /* message */, 0 /* rv */, RAFT_IOERR /* status */); + SEND_WAIT(1); + SEND_WAIT(2); + SEND(3); + return MUNIT_OK; +} + +static char *oomHeapFaultDelay[] = {"0", "1", "2", "3", "4", NULL}; +static char *oomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum oomParams[] = { + {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory conditions. */ +TEST(send, oom, setUp, tearDown, 0, oomParams) +{ + struct fixture *f = data; + HEAP_FAULT_ENABLE; + SEND_ERROR(0, RAFT_NOMEM, ""); + return MUNIT_OK; +} + +static char *oomAsyncHeapFaultDelay[] = {"2", NULL}; +static char *oomAsyncHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum oomAsyncParams[] = { + {TEST_HEAP_FAULT_DELAY, oomAsyncHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, oomAsyncHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Transient out of memory error happening after send() has returned. */ +TEST(send, oomAsync, setUp, tearDown, 0, oomAsyncParams) +{ + struct fixture *f = data; + SEND(0); + return MUNIT_OK; +} + +/* The backend gets closed while there is a pending write. */ +TEST(send, closeDuringWrite, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry entry; + + /* Set a very large message that is likely to fill the socket buffer. + * TODO: figure out a more deterministic way to choose the value. */ + entry.buf.len = 1024 * 1024 * 8; + entry.buf.base = raft_malloc(entry.buf.len); + + MESSAGE(0)->type = RAFT_IO_APPEND_ENTRIES; + MESSAGE(0)->append_entries.entries = &entry; + MESSAGE(0)->append_entries.n_entries = 1; + + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + TEAR_DOWN_UV; + + raft_free(entry.buf.base); + + return MUNIT_OK; +} + +/* The backend gets closed while there is a pending connect request. */ +TEST(send, closeDuringConnection, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + SEND_SUBMIT(0 /* message */, 0 /* rv */, RAFT_CANCELED /* status */); + TEAR_DOWN_UV; + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_set_term.c b/test/raft/integration/test_uv_set_term.c new file mode 100644 index 000000000..7329b4568 --- /dev/null +++ b/test/raft/integration/test_uv_set_term.c @@ -0,0 +1,242 @@ +#include "../../../src/raft.h" +#include "../../../src/raft/byte.h" +#include "../lib/runner.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance.
+ * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; + bool closed; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +static void closeCb(struct raft_io *io) +{ + struct fixture *f = io->data; + f->closed = true; +} + +/* Invoke raft_uv_init() and assert that no error occurs. */ +#define INIT \ + do { \ + int _rv; \ + _rv = raft_uv_init(&f->io, &f->loop, f->dir, &f->transport); \ + munit_assert_int(_rv, ==, 0); \ + _rv = f->io.init(&f->io, 1, "1"); \ + munit_assert_int(_rv, ==, 0); \ + } while (0) + +/* Invoke raft_io->close(). */ +#define CLOSE \ + do { \ + f->io.close(&f->io, closeCb); \ + LOOP_RUN_UNTIL(&f->closed); \ + raft_uv_close(&f->io); \ + } while (0) + +/* Invoke f->io->set_term() and assert that no error occurs. */ +#define SET_TERM(TERM) \ + do { \ + int _rv; \ + _rv = f->io.set_term(&f->io, TERM); \ + munit_assert_int(_rv, ==, 0); \ + } while (0) + +/* Invoke f->io->set_term() and assert that the given error code is returned and + * the given error message set. */ +#define SET_TERM_ERROR(TERM, RV, ERRMSG) \ + do { \ + int _rv; \ + _rv = f->io.set_term(&f->io, TERM); \ + munit_assert_int(_rv, ==, RV); \ + munit_assert_string_equal(f->io.errmsg_(&f->io), ERRMSG); \ + } while (0) + +/* Write either the metadata1 or metadata2 file, filling it with the given + * values. */ +#define WRITE_METADATA_FILE(N, FORMAT, VERSION, TERM, VOTED_FOR) \ + { \ + uint8_t buf[8 * 4]; \ + void *cursor = buf; \ + char filename[strlen("metadataN") + 1]; \ + sprintf(filename, "metadata%d", N); \ + bytePut64(&cursor, FORMAT); \ + bytePut64(&cursor, VERSION); \ + bytePut64(&cursor, TERM); \ + bytePut64(&cursor, VOTED_FOR); \ + DirWriteFile(f->dir, filename, buf, sizeof buf); \ + } + +/* Assert that the content of either the metadata1 or metadata2 file match the + * given values. */ +#define ASSERT_METADATA_FILE(N, VERSION, TERM, VOTED_FOR) \ + { \ + uint8_t buf2[8 * 4]; \ + const void *cursor = buf2; \ + char filename[strlen("metadataN") + 1]; \ + sprintf(filename, "metadata%d", N); \ + DirReadFile(f->dir, filename, buf2, sizeof buf2); \ + munit_assert_int(byteGet64(&cursor), ==, 1); \ + munit_assert_int(byteGet64(&cursor), ==, VERSION); \ + munit_assert_int(byteGet64(&cursor), ==, TERM); \ + munit_assert_int(byteGet64(&cursor), ==, VOTED_FOR); \ + } + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + f->io.data = f; + f->closed = false; + return f; +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + INIT; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + CLOSE; + TEAR_DOWN_UV_DEPS; + free(f); +} + +/****************************************************************************** + * + * raft_io->set_term() + * + *****************************************************************************/ + +SUITE(set_term) + +/* The very first time set_term() is called, the metadata1 file gets written. 
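+ * Subsequent calls alternate between metadata1 and metadata2 while the + * version counter keeps growing, so the file with the higher version always + * holds the current term. Roughly: + * + * SET_TERM(1) -> metadata1 {version 1, term 1} + * SET_TERM(2) -> metadata2 {version 2, term 2} + * SET_TERM(3) -> metadata1 {version 3, term 3} + * SET_TERM(4) -> metadata2 {version 4, term 4}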
*/ +TEST(set_term, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SET_TERM(1); + ASSERT_METADATA_FILE(1, 1, 1, 0); + munit_assert_false(DirHasFile(f->dir, "metadata2")); + return MUNIT_OK; +} + +/* The second time set_term() is called, the metadata2 file gets written. */ +TEST(set_term, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SET_TERM(1); + SET_TERM(2); + ASSERT_METADATA_FILE(1, 1, 1, 0); + ASSERT_METADATA_FILE(2, 2, 2, 0); + return MUNIT_OK; +} + +/* The third time set_term() is called, the metadata1 file gets overwritten. */ +TEST(set_term, third, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SET_TERM(1); + SET_TERM(2); + SET_TERM(3); + ASSERT_METADATA_FILE(1, 3, 3, 0); + ASSERT_METADATA_FILE(2, 2, 2, 0); + return MUNIT_OK; +} + +/* The fourth time set_term() is called, the metadata2 file gets overwritten. */ +TEST(set_term, fourth, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SET_TERM(1); + SET_TERM(2); + SET_TERM(3); + SET_TERM(4); + ASSERT_METADATA_FILE(1, 3, 3, 0); + ASSERT_METADATA_FILE(2, 4, 4, 0); + return MUNIT_OK; +} + +/* If the data directory has a single metadata1 file, the first time set_term() + * is called, the second metadata file gets created. */ +TEST(set_term, metadataOneExists, setUpDeps, tearDown, 0, NULL) +{ + struct fixture *f = data; + WRITE_METADATA_FILE(1, /* Metadata file index */ + 1, /* Format */ + 1, /* Version */ + 1, /* Term */ + 0 /* Voted for */); + INIT; + SET_TERM(2); + ASSERT_METADATA_FILE(1, 1, 1, 0); + ASSERT_METADATA_FILE(2, 2, 2, 0); + return MUNIT_OK; +} + +/* The data directory has both metadata files, but metadata1 is greater. */ +TEST(set_term, metadataOneIsGreater, setUpDeps, tearDown, 0, NULL) +{ + struct fixture *f = data; + WRITE_METADATA_FILE(1, /* Metadata file index */ + 1, /* Format */ + 3, /* Version */ + 3, /* Term */ + 0 /* Voted for */); + WRITE_METADATA_FILE(2, /* Metadata file index */ + 1, /* Format */ + 2, /* Version */ + 2, /* Term */ + 0 /* Voted for */); + INIT; + SET_TERM(4); + ASSERT_METADATA_FILE(1 /* n */, 3 /* version */, 3 /* term */, + 0 /* voted for */); + ASSERT_METADATA_FILE(2 /* n */, 4 /* version */, 4 /* term */, + 0 /* voted for */); + return MUNIT_OK; +} + +/* The data directory has both metadata files, but metadata2 is greater. */ +TEST(set_term, metadataTwoIsGreater, setUpDeps, tearDown, 0, NULL) +{ + struct fixture *f = data; + WRITE_METADATA_FILE(1, /* Metadata file index */ + 1, /* Format */ + 1, /* Version */ + 1, /* Term */ + 0 /* Voted for */); + WRITE_METADATA_FILE(2, /* Metadata file index */ + 1, /* Format */ + 2, /* Version */ + 2, /* Term */ + 0 /* Voted for */); + INIT; + SET_TERM(2); + ASSERT_METADATA_FILE(1 /* n */, 3 /* version */, 2 /* term */, + 0 /* voted for */); + ASSERT_METADATA_FILE(2 /* n */, 2 /* version */, 2 /* term */, + 0 /* voted for */); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_snapshot_put.c b/test/raft/integration/test_uv_snapshot_put.c new file mode 100644 index 000000000..5e33b3e2a --- /dev/null +++ b/test/raft/integration/test_uv_snapshot_put.c @@ -0,0 +1,315 @@ +#include + +#include "../lib/runner.h" +#include "../lib/tcp.h" +#include "../lib/uv.h" +#include "append_helpers.h" + +/****************************************************************************** + * + * Fixture with a libuv-based raft_io instance.
+ * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; + bool closed; + int count; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct snapshot +{ + raft_term term; + raft_index index; + uint64_t data; + bool done; +}; + +static void snapshotPutCbAssertResult(struct raft_io_snapshot_put *req, + int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +static void snapshotGetCbAssertResult(struct raft_io_snapshot_get *req, + struct raft_snapshot *snapshot, + int status) +{ + struct snapshot *expect = req->data; + munit_assert_int(status, ==, 0); + munit_assert_ptr_not_null(snapshot); + munit_assert_int(snapshot->term, ==, expect->term); + munit_assert_int(snapshot->index, ==, expect->index); + expect->done = true; + raft_configuration_close(&snapshot->configuration); + raft_free(snapshot->bufs[0].base); + raft_free(snapshot->bufs); + raft_free(snapshot); +} + +/* Submit a request to truncate the log at N */ +#define TRUNCATE(N) \ + { \ + int _rv; \ + _rv = f->io.truncate(&f->io, N); \ + munit_assert_int(_rv, ==, 0); \ + } + +#define SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, STATUS) \ + struct raft_snapshot _snapshot; \ + struct raft_buffer _snapshot_buf; \ + uint64_t _snapshot_data; \ + struct raft_io_snapshot_put _req; \ + struct result _result = {STATUS, false, NULL}; \ + int _rv; \ + _snapshot.term = 1; \ + _snapshot.index = INDEX; \ + raft_configuration_init(&_snapshot.configuration); \ + _rv = raft_configuration_add(&_snapshot.configuration, 1, "1", \ + RAFT_STANDBY); \ + munit_assert_int(_rv, ==, 0); \ + _snapshot.bufs = &_snapshot_buf; \ + _snapshot.n_bufs = 1; \ + _snapshot_buf.base = &_snapshot_data; \ + _snapshot_buf.len = sizeof _snapshot_data; \ + _req.data = &_result; \ + _rv = f->io.snapshot_put(&f->io, TRAILING, &_req, &_snapshot, \ + snapshotPutCbAssertResult); \ + munit_assert_int(_rv, ==, RV) + +/* Submit a snapshot put request for the given snapshot and wait for the + * operation to successfully complete. */ +#define SNAPSHOT_PUT(TRAILING, INDEX) \ + do { \ + SNAPSHOT_PUT_REQ(TRAILING, INDEX, 0 /* rv */, 0 /* status */); \ + LOOP_RUN_UNTIL(&_result.done); \ + raft_configuration_close(&_snapshot.configuration); \ + } while (0) + +/* Submit a snapshot put request and assert that it fails synchronously with the + * given error code and message. */ +#define SNAPSHOT_PUT_ERROR(TRAILING, INDEX, RV, ERRMSG) \ + do { \ + SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, 0 /* status */); \ + /* munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ + } while (0) + +/* Submit a snapshot put request and wait for the operation to fail with the + * given code and message. */ +#define SNAPSHOT_PUT_FAILURE(TRAILING, INDEX, STATUS, ERRMSG) \ + do { \ + SNAPSHOT_PUT_REQ(TRAILING, INDEX, 0 /* rv */, STATUS); \ + LOOP_RUN_UNTIL(&_result.done); \ + /*munit_assert_string_equal(f->transport.errmsg, ERRMSG);*/ \ + } while (0) + +/* Use raft_io->snapshot_get to load the last snapshot and compare it with the + * given parameters.
*/ +#define ASSERT_SNAPSHOT(TERM, INDEX, DATA) \ + do { \ + struct raft_io_snapshot_get _req; \ + struct snapshot _expect = {TERM, INDEX, DATA, false}; \ + int _rv; \ + _req.data = &_expect; \ + _rv = f->io.snapshot_get(&f->io, &_req, snapshotGetCbAssertResult); \ + munit_assert_int(_rv, ==, 0); \ + LOOP_RUN_UNTIL(&_expect.done); \ + } while (0) + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + f->io.data = f; + f->closed = false; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV_DEPS; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + SETUP_UV; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV; + tearDownDeps(f); +} + +/****************************************************************************** + * + * raft_io->snapshot_put + * + *****************************************************************************/ + +SUITE(snapshot_put) + +/* Put the first snapshot. */ +TEST(snapshot_put, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(10, /* trailing */ + 1 /* index */ + ); + ASSERT_SNAPSHOT(1, 1, 1); + return MUNIT_OK; +} + +/* If the number of closed entries is less than the given trailing amount, no + * segment is deleted. */ +TEST(snapshot_put, entriesLessThanTrailing, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + unsigned i; + raft_uv_set_segment_size( + &f->io, 4096); /* Lower the number of blocks to force finalizing */ + + for (i = 0; i < 40; i++) { + APPEND(10, 8); + } + + SNAPSHOT_PUT(128, /* trailing */ + 100 /* index */ + ); + + munit_assert_true(DirHasFile(f->dir, "0000000000000001-0000000000000150")); + munit_assert_true(DirHasFile(f->dir, "0000000000000151-0000000000000300")); + + return MUNIT_OK; +} + +/* If the number of closed entries is greater than the given trailing amount, + * closed segments that are fully past the trailing amount get deleted. */ +TEST(snapshot_put, entriesMoreThanTrailing, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + unsigned i; + raft_uv_set_segment_size( + &f->io, 4096); /* Lower the number of blocks to force finalizing */ + + for (i = 0; i < 40; i++) { + APPEND(10, 8); + } + + SNAPSHOT_PUT(128, /* trailing */ + 280 /* index */ + ); + + munit_assert_false(DirHasFile(f->dir, "0000000000000001-0000000000000150")); + munit_assert_true(DirHasFile(f->dir, "0000000000000151-0000000000000300")); + + return MUNIT_OK; +} + +/* Request to install a snapshot. */ +TEST(snapshot_put, install, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(4, 8); + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + return MUNIT_OK; +} + +/* Request to install a snapshot without compression. */ +TEST(snapshot_put, installNoCompression, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + raft_uv_set_snapshot_compression(&f->io, false); + APPEND(4, 8); + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + return MUNIT_OK; +} + +/* Request to install a snapshot, no previous entry is present.
*/ +TEST(snapshot_put, installWithoutPreviousEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + return MUNIT_OK; +} + +/* Request to install a couple of snapshots in a row, no previous entry is + * present. */ +TEST(snapshot_put, + installMultipleWithoutPreviousEntries, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + SNAPSHOT_PUT(0, /* trailing */ + 3 /* index */ + ); + SNAPSHOT_PUT(0, /* trailing */ + 1337 /* index */ + ); + return MUNIT_OK; +} + +/* Request to install a couple of snapshots in a row, AppendEntries Requests + * happen before, meanwhile and after */ +TEST(snapshot_put, + installMultipleAppendEntriesInBetween, + setUp, + tearDown, + 0, + NULL) +{ + struct fixture *f = data; + + APPEND_SUBMIT(0, 256, 8); + APPEND_SUBMIT(1, 256, 8); + SNAPSHOT_PUT(0, /* trailing */ + 1 /* index */ + ); + APPEND_WAIT(0); + APPEND_WAIT(1); + APPEND_SUBMIT(2, 256, 8); + APPEND_SUBMIT(3, 256, 8); + SNAPSHOT_PUT(0, /* trailing */ + 100 /* index */ + ); + APPEND_WAIT(2); + APPEND_WAIT(3); + APPEND_SUBMIT(4, 256, 8); + APPEND_SUBMIT(5, 256, 8); + APPEND_WAIT(4); + APPEND_WAIT(5); + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_tcp_connect.c b/test/raft/integration/test_uv_tcp_connect.c new file mode 100644 index 000000000..7efc68c60 --- /dev/null +++ b/test/raft/integration/test_uv_tcp_connect.c @@ -0,0 +1,358 @@ +#include "../../../src/raft.h" +#include "../lib/addrinfo.h" +#include "../lib/heap.h" +#include "../lib/loop.h" +#include "../lib/runner.h" +#include "../lib/tcp.h" + +/****************************************************************************** + * + * Fixture with a TCP-based raft_uv_transport. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + FIXTURE_LOOP; + FIXTURE_TCP_SERVER; + struct raft_uv_transport transport; + bool closed; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; +}; + +static void closeCb(struct raft_uv_transport *transport) +{ + struct fixture *f = transport->data; + f->closed = true; +} + +static void connectCbAssertResult(struct raft_uv_connect *req, + struct uv_stream_s *stream, + int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + if (status == 0) { + uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free); + } + result->done = true; +} + +#define INIT \ + do { \ + int _rv; \ + _rv = f->transport.init(&f->transport, 1, "127.0.0.1:9000"); \ + munit_assert_int(_rv, ==, 0); \ + f->transport.data = f; \ + f->closed = false; \ + } while (0) + +#define CLOSE_SUBMIT \ + munit_assert_false(f->closed); \ + f->transport.close(&f->transport, closeCb); + +#define CLOSE_WAIT LOOP_RUN_UNTIL(&f->closed) +#define CLOSE \ + CLOSE_SUBMIT; \ + CLOSE_WAIT + +#define CONNECT_REQ(ID, ADDRESS, RV, STATUS) \ + struct raft_uv_connect _req; \ + struct result _result = {STATUS, false}; \ + int _rv; \ + _req.data = &_result; \ + _rv = f->transport.connect(&f->transport, &_req, ID, ADDRESS, \ + connectCbAssertResult); \ + munit_assert_int(_rv, ==, RV) + +/* Try to submit a connect request and assert that the given error code and + * message are returned.
*/ +#define CONNECT_ERROR(ID, ADDRESS, RV, ERRMSG) \ + { \ + CONNECT_REQ(ID, ADDRESS, RV /* rv */, 0 /* status */); \ + munit_assert_string_equal(f->transport.errmsg, ERRMSG); \ + } + +/* Submit a connect request with the given parameters and wait for the operation + * to successfully complete. */ +#define CONNECT(ID, ADDRESS) \ + { \ + CONNECT_REQ(ID, ADDRESS, 0 /* rv */, 0 /* status */); \ + LOOP_RUN_UNTIL(&_result.done); \ + } + +/* Submit a connect request with the given parameters and wait for the operation + * to fail with the given code and message. */ +#define CONNECT_FAILURE(ID, ADDRESS, STATUS, ERRMSG) \ + { \ + CONNECT_REQ(ID, ADDRESS, 0 /* rv */, STATUS); \ + LOOP_RUN_UNTIL(&_result.done); \ + munit_assert_string_equal(f->transport.errmsg, ERRMSG); \ + } + +/* Submit a connect request with the given parameters, close the transport after + * N loop iterations and assert that the request got canceled. */ +#define CONNECT_CLOSE(ID, ADDRESS, N) \ + { \ + CONNECT_REQ(ID, ADDRESS, 0 /* rv */, RAFT_CANCELED); \ + LOOP_RUN(N); \ + CLOSE_SUBMIT; \ + munit_assert_false(_result.done); \ + LOOP_RUN_UNTIL(&_result.done); \ + CLOSE_WAIT; \ + } + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], + MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + int rv; + SET_UP_ADDRINFO; + SET_UP_HEAP; + SETUP_LOOP; + SETUP_TCP_SERVER; + f->transport.version = 1; + rv = raft_uv_tcp_init(&f->transport, &f->loop); + munit_assert_int(rv, ==, 0); + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + LOOP_STOP; + raft_uv_tcp_close(&f->transport); + TEAR_DOWN_TCP_SERVER; + TEAR_DOWN_LOOP; + TEAR_DOWN_HEAP; + TEAR_DOWN_ADDRINFO; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + INIT; + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + CLOSE; + tearDownDeps(f); +} + +/****************************************************************************** + * + * raft_uv_transport->connect() + * + *****************************************************************************/ + +#define BOGUS_ADDRESS "127.0.0.1:6666" +#define INVALID_ADDRESS "500.0.0.1:6666" + +SUITE(tcp_connect) + +/* Successfully connect to the peer by IP */ +TEST(tcp_connect, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + CONNECT(2, TCP_SERVER_ADDRESS); + return MUNIT_OK; +} + +/* Successfully connect to the peer by hostname */ +TEST(tcp_connect, connectByName, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + char host_address[256]; + sprintf(host_address, "localhost:%d", TCP_SERVER_PORT); + CONNECT(2, host_address); + return MUNIT_OK; +} + +/* Successfully connect to the peer by first IP */ +TEST(tcp_connect, firstIP, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + const struct AddrinfoResult results[] = {{"127.0.0.1", TCP_SERVER_PORT}, + {"192.0.2.0", 6666}}; + AddrinfoInjectSetResponse(0, 2, results); + CONNECT(2, "any-host"); + return MUNIT_OK; +} + +/* Successfully connect to the peer by second IP */ +TEST(tcp_connect, secondIP, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + const struct AddrinfoResult results[] = {{"127.0.0.1", 6666}, + {"127.0.0.1", TCP_SERVER_PORT}}; + + AddrinfoInjectSetResponse(0, 2, results); +
CONNECT(2, "any-host"); + return MUNIT_OK; +} + +/* The peer has shutdown */ +TEST(tcp_connect, refused, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + TCP_SERVER_STOP; + CONNECT_FAILURE(2, BOGUS_ADDRESS, RAFT_NOCONNECTION, + "uv_tcp_connect(): connection refused"); + return MUNIT_OK; +} + +static char *oomHeapFaultDelay[] = {"0", "1", "2", NULL}; +static char *oomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum oomParams[] = { + {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory conditions. */ +TEST(tcp_connect, oom, setUp, tearDown, 0, oomParams) +{ + struct fixture *f = data; + HEAP_FAULT_ENABLE; + CONNECT_ERROR(2, BOGUS_ADDRESS, RAFT_NOMEM, "out of memory"); + return MUNIT_OK; +} + +/* The transport is closed immediately after a connect request as been + * submitted. The request's callback is invoked with RAFT_CANCELED. */ +TEST(tcp_connect, closeImmediately, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 0); + return MUNIT_OK; +} + +/* The transport gets closed during the dns lookup */ +TEST(tcp_connect, closeDuringDnsLookup, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + + CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 1); + return MUNIT_OK; +} + +/* The transport gets closed during the handshake. */ +TEST(tcp_connect, closeDuringHandshake, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + + /* This test fails for libuv version >= 1.44.2 due to changes in uv_run + * whereby queueing and processing the write_cb happen in the same loop + * iteration, not leaving us a chance to close without going through a lot + * of hoops. + * https://github.com/libuv/libuv/pull/3598 */ + unsigned incompatible_uv = (1 << 16) | (44 << 8) | 2; + if (uv_version() >= incompatible_uv) { + CLOSE; + return MUNIT_SKIP; + } + + CONNECT_CLOSE(2, TCP_SERVER_ADDRESS, 2); + return MUNIT_OK; +} + +static void checkCb(struct uv_check_s *check) +{ + struct fixture *f = check->data; + CLOSE_SUBMIT; + uv_close((struct uv_handle_s *)check, NULL); +} + +/* The transport gets closed right after a dns lookup failure, while the + * connection attempt is being aborted. */ +TEST(tcp_connect, closeDuringDnsLookupAbort, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + struct uv_check_s check; + int rv; + /* Use a check handle in order to close the transport in the same loop + * iteration where the dns failure lookup occurs */ + rv = uv_check_init(&f->loop, &check); + munit_assert_int(rv, ==, 0); + check.data = f; + uv_check_start(&check, checkCb); + CONNECT_REQ(2, INVALID_ADDRESS, 0, RAFT_NOCONNECTION); + LOOP_RUN(1); + LOOP_RUN_UNTIL(&_result.done); + CLOSE_WAIT; + return MUNIT_OK; +} + +/* The transport gets closed right after a connection failure, while the + * connection attempt is being aborted. */ +TEST(tcp_connect, closeDuringConnectAbort, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + struct uv_check_s check; + int rv; + + /* Use a check handle in order to close the transport in the same loop + * iteration where the connection failure occurs. 
+/* The transport gets closed right after a connection failure, while the
+ * connection attempt is being aborted. */
+TEST(tcp_connect, closeDuringConnectAbort, setUp, tearDownDeps, 0, NULL)
+{
+    struct fixture *f = data;
+    struct uv_check_s check;
+    int rv;
+
+    /* Use a check handle in order to close the transport in the same loop
+     * iteration where the connection failure occurs. */
+    rv = uv_check_init(&f->loop, &check);
+    munit_assert_int(rv, ==, 0);
+    check.data = f;
+    CONNECT_REQ(2, BOGUS_ADDRESS, 0, RAFT_NOCONNECTION);
+    /* A successful DNS lookup will initiate the async connect */
+    LOOP_RUN(1);
+    uv_check_start(&check, checkCb);
+    LOOP_RUN(1);
+    LOOP_RUN_UNTIL(&_result.done);
+    CLOSE_WAIT;
+    return MUNIT_OK;
+}
+
+/* The transport gets closed right after the first connection attempt failed,
+ * while doing a second connection attempt. */
+TEST(tcp_connect, closeDuringSecondConnect, setUp, tearDownDeps, 0, NULL)
+{
+    struct fixture *f = data;
+    struct uv_check_s check;
+    int rv;
+    const struct AddrinfoResult results[] = {{"127.0.0.1", 6666},
+                                             {"127.0.0.1", TCP_SERVER_PORT}};
+
+    AddrinfoInjectSetResponse(0, 2, results);
+
+    /* Use a check handle in order to close the transport in the same loop
+     * iteration where the second connection attempt occurs. */
+    rv = uv_check_init(&f->loop, &check);
+    munit_assert_int(rv, ==, 0);
+    check.data = f;
+    CONNECT_REQ(2, "any-host", 0, RAFT_CANCELED);
+    /* A successful DNS lookup will initiate the async connect */
+    LOOP_RUN(1);
+    uv_check_start(&check, checkCb);
+    LOOP_RUN(1);
+    LOOP_RUN_UNTIL(&_result.done);
+    CLOSE_WAIT;
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_tcp_listen.c b/test/raft/integration/test_uv_tcp_listen.c
new file mode 100644
index 000000000..b239cfa87
--- /dev/null
+++ b/test/raft/integration/test_uv_tcp_listen.c
@@ -0,0 +1,416 @@
+#include "../../../src/raft.h"
+#include "../../../src/raft/byte.h"
+#include "../lib/addrinfo.h"
+#include "../lib/heap.h"
+#include "../lib/loop.h"
+#include "../lib/runner.h"
+#include "../lib/tcp.h"
+
+/******************************************************************************
+ *
+ * Fixture with a TCP-based raft_uv_transport.
+ * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + FIXTURE_LOOP; + FIXTURE_TCP; + struct raft_uv_transport transport; + bool accepted; + bool closed; + struct + { + uint8_t buf[sizeof(uint64_t) + /* Protocol version */ + sizeof(uint64_t) + /* Server ID */ + sizeof(uint64_t) + /* Length of address */ + sizeof(uint64_t) * 2 /* Address */]; + size_t offset; + } handshake; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +#define PEER_ID 2 +#define PEER_ADDRESS "127.0.0.1:666" + +static void closeCb(struct raft_uv_transport *transport) +{ + struct fixture *f = transport->data; + f->closed = true; +} + +static void acceptCb(struct raft_uv_transport *t, + raft_id id, + const char *address, + struct uv_stream_s *stream) +{ + struct fixture *f = t->data; + munit_assert_int(id, ==, PEER_ID); + munit_assert_string_equal(address, PEER_ADDRESS); + f->accepted = true; + uv_close((struct uv_handle_s *)stream, (uv_close_cb)raft_free); +} + +#define INIT \ + do { \ + int _rv; \ + f->transport.version = 1; \ + _rv = raft_uv_tcp_init(&f->transport, &f->loop); \ + munit_assert_int(_rv, ==, 0); \ + const char *bind_addr = munit_parameters_get(params, "bind-address"); \ + if (bind_addr && strlen(bind_addr)) { \ + _rv = raft_uv_tcp_set_bind_address(&f->transport, bind_addr); \ + munit_assert_int(_rv, ==, 0); \ + } \ + const char *address = munit_parameters_get(params, "address"); \ + if (!address) { \ + address = "127.0.0.1:9000"; \ + } \ + _rv = f->transport.init(&f->transport, 1, address); \ + munit_assert_int(_rv, ==, 0); \ + f->transport.data = f; \ + f->closed = false; \ + } while (0) + +#define CLOSE \ + do { \ + f->transport.close(&f->transport, closeCb); \ + LOOP_RUN_UNTIL(&f->closed); \ + raft_uv_tcp_close(&f->transport); \ + } while (0) + +/****************************************************************************** + * + * Set up and tear down. + * + *****************************************************************************/ + +static void *setUpDeps(const MunitParameter params[], + MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SET_UP_ADDRINFO; + SET_UP_HEAP; + SETUP_LOOP; + SETUP_TCP; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_TCP; + TEAR_DOWN_LOOP; + TEAR_DOWN_HEAP; + TEAR_DOWN_ADDRINFO; + free(f); +} + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = setUpDeps(params, user_data); + void *cursor; + /* test_tcp_listen(&f->tcp); */ + INIT; + f->accepted = false; + f->handshake.offset = 0; + + cursor = f->handshake.buf; + bytePut64(&cursor, 1); + bytePut64(&cursor, PEER_ID); + bytePut64(&cursor, 16); + strcpy(cursor, PEER_ADDRESS); + + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + CLOSE; + tearDownDeps(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +#define LISTEN(EXPECTED_RV) \ + do { \ + int rv; \ + rv = f->transport.listen(&f->transport, acceptCb); \ + munit_assert_int(rv, ==, EXPECTED_RV); \ + } while (false) + +/* Connect to the listening socket of the transport, creating a new connection + * that is waiting to be accepted. 
*/
+#define PEER_CONNECT TCP_CLIENT_CONNECT(9000)
+
+/* Make the peer close the connection. */
+#define PEER_CLOSE TCP_CLIENT_CLOSE
+
+/* Make the connected client send handshake data. */
+#define PEER_HANDSHAKE                        \
+    do {                                      \
+        size_t n = sizeof f->handshake.buf;   \
+        TCP_CLIENT_SEND(f->handshake.buf, n); \
+    } while (0)
+
+/* Make the connected client send partial handshake data: only N bytes will be
+ * sent, starting from the offset of the last call. */
+#define PEER_HANDSHAKE_PARTIAL(N)                                   \
+    do {                                                            \
+        TCP_CLIENT_SEND(f->handshake.buf + f->handshake.offset, N); \
+    } while (0)
+
+/* After a PEER_CONNECT() call, spin the event loop until the connected
+ * callback of the listening TCP handle gets called. */
+#define LOOP_RUN_UNTIL_CONNECTED LOOP_RUN(1);
+
+/* After a PEER_HANDSHAKE_PARTIAL() call, spin the event loop until the read
+ * callback gets called. */
+#define LOOP_RUN_UNTIL_READ LOOP_RUN(1);
+
+/* Spin the event loop until the accept callback gets eventually invoked. */
+#define ACCEPT LOOP_RUN_UNTIL(&f->accepted);
+
+/******************************************************************************
+ *
+ * Success scenarios.
+ *
+ *****************************************************************************/
+
+SUITE(tcp_listen)
+
+/* Parameters for listen address */
+
+static char *validAddresses[] = {"127.0.0.1:9000", "localhost:9000", NULL};
+
+static char *validBindAddresses[] = {
+    "", "127.0.0.1:9000", "localhost:9000", ":9000", "0.0.0.0:9000", NULL};
+
+static MunitParameterEnum validListenParams[] = {
+    {"address", validAddresses},
+    {"bind-address", validBindAddresses},
+    {NULL, NULL},
+};
+
+/* If the handshake is successful, the accept callback is invoked. */
+TEST(tcp_listen, success, setUp, tearDown, 0, validListenParams)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    ACCEPT;
+    return MUNIT_OK;
+}
+
+/* Parameters for invalid listen addresses */
+static char *invalidAddresses[] = {"500.1.2.3:9000", "not-existing:9000",
+                                   "192.0.2.0:9000", NULL};
+
+static char *invalidBindAddresses[] = {
+    "", "500.1.2.3:9000", "not-existing:9000", "192.0.2.0:9000", NULL};
+
+static MunitParameterEnum invalidTcpListenParams[] = {
+    {"address", invalidAddresses},
+    {"bind-address", invalidBindAddresses},
+    {NULL, NULL},
+};
+
+/* Check that an error is returned when an invalid hostname is specified. */
+TEST(tcp_listen, invalidAddress, setUp, tearDown, 0, invalidTcpListenParams)
+{
+    struct fixture *f = data;
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* Check success when addrinfo resolves to multiple IPs and the first one is
+ * used to connect. */
+TEST(tcp_listen, firstOfTwo, setUp, tearDown, 0, NULL)
+{
+    const struct AddrinfoResult results[] = {{"127.0.0.1", 9000},
+                                             {"127.0.0.2", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 2, results);
+    LISTEN(0);
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    ACCEPT;
+    return MUNIT_OK;
+}
+
+/* Check success when addrinfo resolves to multiple IPs and the second one is
+ * used to connect. */
+TEST(tcp_listen, secondOfTwo, setUp, tearDown, 0, NULL)
+{
+    const struct AddrinfoResult results[] = {{"127.0.0.2", 9000},
+                                             {"127.0.0.1", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 2, results);
+
+    LISTEN(0);
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    ACCEPT;
+    return MUNIT_OK;
+}
+
+/* Simulate a port-already-in-use error by making the addrinfo response contain
+ * the same IP twice. */
+TEST(tcp_listen, alreadyBound, setUp, tearDown, 0, NULL)
+{
+    /* We need to use the same endpoint three times as a simple duplicate will
+     * be skipped due to a strange glibc behavior, see
+     * https://bugzilla.redhat.com/show_bug.cgi?id=496300 */
+    const struct AddrinfoResult results[] = {
+        {"127.0.0.1", 9000}, {"127.0.0.1", 9000}, {"127.0.0.1", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 3, results);
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* Error binding the first IP address. */
+TEST(tcp_listen, cannotBindFirst, setUp, tearDown, 0, NULL)
+{
+    const struct AddrinfoResult results[] = {{"192.0.2.0", 9000},
+                                             {"127.0.0.1", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 2, results);
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* Error binding the second IP address. */
+TEST(tcp_listen, cannotBindSecond, setUp, tearDown, 0, NULL)
+{
+    const struct AddrinfoResult results[] = {{"127.0.0.1", 9000},
+                                             {"192.0.2.0", 9000}};
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(0, 2, results);
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* Check the error returned on a general DNS server failure. */
+TEST(tcp_listen, resolveFailure, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    AddrinfoInjectSetResponse(EAI_FAIL, 0, NULL);
+    LISTEN(RAFT_IOERR);
+    return MUNIT_OK;
+}
+
+/* The client sends us a bad protocol version */
+TEST(tcp_listen, badProtocol, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    memset(f->handshake.buf, 999, sizeof(uint64_t));
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    LOOP_RUN_UNTIL_CONNECTED;
+    LOOP_RUN_UNTIL_READ;
+    return MUNIT_OK;
+}
+
+/* Parameters for sending a partial handshake */
+static char *partialHandshakeN[] = {"8", "16", "24", "32", NULL};
+
+static MunitParameterEnum peerAbortParams[] = {
+    {"n", partialHandshakeN},
+    {NULL, NULL},
+};
+
+/* The peer closes the connection after having sent a partial handshake. */
+TEST(tcp_listen, peerAbort, setUp, tearDown, 0, peerAbortParams)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    const char *n = munit_parameters_get(params, "n");
+    PEER_CONNECT;
+    PEER_HANDSHAKE_PARTIAL(atoi(n));
+    LOOP_RUN_UNTIL_CONNECTED;
+    LOOP_RUN_UNTIL_READ;
+    PEER_CLOSE;
+    return MUNIT_OK;
+}
+
+/* TODO: skip "2" because it makes libuv crash, as it calls abort(). See also
+ * https://github.com/libuv/libuv/issues/1948 */
+static char *oomHeapFaultDelay[] = {"0", "1", "3", NULL};
+static char *oomHeapFaultRepeat[] = {"1", NULL};
+
+static MunitParameterEnum oomParams[] = {
+    {TEST_HEAP_FAULT_DELAY, oomHeapFaultDelay},
+    {TEST_HEAP_FAULT_REPEAT, oomHeapFaultRepeat},
+    {NULL, NULL},
+};
+
+/* Out of memory conditions */
+TEST(tcp_listen, oom, setUp, tearDown, 0, oomParams)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    PEER_CONNECT;
+    PEER_HANDSHAKE;
+    HEAP_FAULT_ENABLE;
+
+    /* Run as much as possible. */
+    uv_run(&f->loop, UV_RUN_NOWAIT);
+    uv_run(&f->loop, UV_RUN_NOWAIT);
+    uv_run(&f->loop, UV_RUN_NOWAIT);
+
+    return MUNIT_OK;
+}
+
+/* Close the transport right after an incoming connection becomes pending, but
+ * it hasn't been accepted yet. */
+TEST(tcp_listen, pending, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    PEER_CONNECT;
+    return MUNIT_OK;
+}
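For reference, the handshake these tests feed to the listener is the fixed preamble that setUp() prepares with bytePut64(): a 64-bit protocol version, the sender's 64-bit server ID, the 64-bit length of the address block, and the NUL-padded address itself. A hypothetical client-side encoder, assuming the little-endian word representation produced by src/raft/byte.h and a little-endian host (so plain memcpy of a uint64_t matches the wire format):

#include <stdint.h>
#include <string.h>

/* Illustrative sketch only: buf must have room for 24 + 16 bytes, and the
 * address must fit in 15 characters plus the terminating NUL. */
static size_t encodeHandshake(uint64_t server_id,
                              const char *address,
                              uint8_t *buf)
{
    uint64_t version = 1;
    uint64_t addr_len = 16; /* address block, padded to a multiple of 8 */
    memcpy(buf + 0, &version, 8);   /* protocol version */
    memcpy(buf + 8, &server_id, 8); /* server ID */
    memcpy(buf + 16, &addr_len, 8); /* length of the address block */
    memset(buf + 24, 0, addr_len);  /* zero padding */
    strcpy((char *)(buf + 24), address);
    return 24 + addr_len;
}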
+
+/* Close the transport right after an incoming connection gets accepted, and
+ * the peer hasn't sent handshake data yet. */
+TEST(tcp_listen, closeBeforeHandshake, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    PEER_CONNECT;
+    LOOP_RUN_UNTIL_CONNECTED;
+    return MUNIT_OK;
+}
+
+static MunitParameterEnum closeDuringHandshake[] = {
+    {"n", partialHandshakeN},
+    {NULL, NULL},
+};
+
+/* Close the transport right after the peer has started to send handshake data,
+ * but isn't done with it yet. */
+TEST(tcp_listen, handshake, setUp, tearDown, 0, closeDuringHandshake)
+{
+    struct fixture *f = data;
+    LISTEN(0);
+    const char *n_param = munit_parameters_get(params, "n");
+    PEER_CONNECT;
+    PEER_HANDSHAKE_PARTIAL(atoi(n_param));
+    LOOP_RUN_UNTIL_CONNECTED;
+    LOOP_RUN_UNTIL_READ;
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_truncate.c b/test/raft/integration/test_uv_truncate.c
new file mode 100644
index 000000000..b702d2669
--- /dev/null
+++ b/test/raft/integration/test_uv_truncate.c
@@ -0,0 +1,296 @@
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+    int count; /* To generate deterministic entry data */
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+struct result
+{
+    int status;
+    bool done;
+};
+
+static void appendCbAssertResult(struct raft_io_append *req, int status)
+{
+    struct result *result = req->data;
+    munit_assert_int(status, ==, result->status);
+    result->done = true;
+}
+
+/* Declare and fill the entries array for the append request identified by
+ * I. The array will have N entries, and each entry will have a data buffer of
+ * SIZE bytes. */
+#define ENTRIES(I, N, SIZE)                                 \
+    struct raft_entry _entries##I[N];                       \
+    uint8_t _entries_data##I[N * SIZE];                     \
+    do {                                                    \
+        int _i;                                             \
+        for (_i = 0; _i < N; _i++) {                        \
+            struct raft_entry *entry = &_entries##I[_i];    \
+            entry->term = 1;                                \
+            entry->type = RAFT_COMMAND;                     \
+            entry->buf.base = &_entries_data##I[_i * SIZE]; \
+            entry->buf.len = SIZE;                          \
+            entry->batch = NULL;                            \
+            munit_assert_ptr_not_null(entry->buf.base);     \
+            memset(entry->buf.base, 0, entry->buf.len);     \
+            f->count++;                                     \
+            *(uint64_t *)entry->buf.base = f->count;        \
+        }                                                   \
+    } while (0)
+
+/* Submit an append request identified by I, with N_ENTRIES entries, each one
+ * of size ENTRY_SIZE. */
+#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE)                     \
+    struct raft_io_append _req##I;                                  \
+    struct result _result##I = {0, false};                          \
+    int _rv##I;                                                     \
+    ENTRIES(I, N_ENTRIES, ENTRY_SIZE);                              \
+    _req##I.data = &_result##I;                                     \
+    _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, \
+                          appendCbAssertResult);                    \
+    munit_assert_int(_rv##I, ==, 0)
+
+/* Wait for the append request identified by I to complete. */
+#define APPEND_WAIT(I) LOOP_RUN_UNTIL(&_result##I.done)
+
+#define APPEND_EXPECT(I, STATUS) _result##I.status = STATUS
+
+/* Submit an append request and wait for it to successfully complete. */
+#define APPEND(N)                  \
+    do {                           \
+        APPEND_SUBMIT(9999, N, 8); \
+        APPEND_WAIT(9999);         \
+    } while (0)
+
+#define TRUNCATE(N)                      \
+    do {                                 \
+        int rv_;                         \
+        rv_ = f->io.truncate(&f->io, N); \
+        munit_assert_int(rv_, ==, 0);    \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SETUP_UV_DEPS; + SETUP_UV; + f->count = 0; + return f; +} + +static void tearDownDeps(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_UV_DEPS; + free(f); +} + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Shutdown the fixture's raft_io instance, then load all entries on disk using + * a new raft_io instance, and assert that there are N entries with data + * matching the DATA array. */ +#define ASSERT_ENTRIES(N, ...) \ + TEAR_DOWN_UV; \ + do { \ + struct uv_loop_s _loop; \ + struct raft_uv_transport _transport; \ + struct raft_io _io; \ + raft_term _term; \ + raft_id _voted_for; \ + struct raft_snapshot *_snapshot; \ + raft_index _start_index; \ + struct raft_entry *_entries; \ + size_t _i; \ + size_t _n; \ + void *_batch = NULL; \ + unsigned _data[N] = {__VA_ARGS__}; \ + int _rv; \ + \ + _rv = uv_loop_init(&_loop); \ + munit_assert_int(_rv, ==, 0); \ + _transport.version = 1; \ + _rv = raft_uv_tcp_init(&_transport, &_loop); \ + munit_assert_int(_rv, ==, 0); \ + _rv = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ + munit_assert_int(_rv, ==, 0); \ + _rv = _io.init(&_io, 1, "1"); \ + munit_assert_int(_rv, ==, 0); \ + _rv = _io.load(&_io, &_term, &_voted_for, &_snapshot, &_start_index, \ + &_entries, &_n); \ + munit_assert_int(_rv, ==, 0); \ + _io.close(&_io, NULL); \ + uv_run(&_loop, UV_RUN_NOWAIT); \ + raft_uv_close(&_io); \ + raft_uv_tcp_close(&_transport); \ + uv_loop_close(&_loop); \ + \ + munit_assert_ptr_null(_snapshot); \ + munit_assert_int(_n, ==, N); \ + for (_i = 0; _i < _n; _i++) { \ + struct raft_entry *_entry = &_entries[_i]; \ + uint64_t _value = *(uint64_t *)_entry->buf.base; \ + munit_assert_int(_entry->term, ==, 1); \ + munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ + munit_assert_int(_value, ==, _data[_i]); \ + munit_assert_ptr_not_null(_entry->batch); \ + } \ + for (_i = 0; _i < _n; _i++) { \ + struct raft_entry *_entry = &_entries[_i]; \ + if (_entry->batch != _batch) { \ + _batch = _entry->batch; \ + raft_free(_batch); \ + } \ + } \ + raft_free(_entries); \ + } while (0); + +/****************************************************************************** + * + * raft_io->truncate() + * + *****************************************************************************/ + +SUITE(truncate) + +/* If the index to truncate is at the start of a segment, that segment and all + * subsequent ones are removed. */ +TEST(truncate, wholeSegment, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(3); + TRUNCATE(1); + APPEND(1); + ASSERT_ENTRIES(1 /* n entries */, 4 /* entries data */); + return MUNIT_OK; +} + +/* The index to truncate is the same as the last appended entry. */ +TEST(truncate, sameAsLastIndex, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(3); + TRUNCATE(3); + APPEND(1); + ASSERT_ENTRIES(3 /* n entries */, 1, 2, 4 /* entries data */); + return MUNIT_OK; +} + +/* If the index to truncate is not at the start of a segment, that segment gets + * truncated. 
*/ +TEST(truncate, partialSegment, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND(3); + APPEND(1); + TRUNCATE(2); + APPEND(1); + ASSERT_ENTRIES(2, /* n entries */ + 1, 5 /* entries data */ + ); + return MUNIT_OK; +} + +/* The truncate request is issued while an append request is still pending. */ +TEST(truncate, pendingAppend, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, /* request ID */ + 3, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(2 /* truncation index */); + APPEND(1); + ASSERT_ENTRIES(2, /* n entries */ + 1, 4 /* entries data */ + ); + return MUNIT_OK; +} + +/* Multiple truncate requests pending at the same time. */ +TEST(truncate, multiplePending, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, /* request ID */ + 3, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(2 /* truncation index */); + APPEND_SUBMIT(1, /* request ID */ + 2, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(3 /* truncation index */); + APPEND(1); + ASSERT_ENTRIES(3, /* n entries */ + 1, 4, 6 /* entries data */ + ); + return MUNIT_OK; +} + +/* The truncate request gets canceled because we're closing. */ +TEST(truncate, closing, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, /* request ID */ + 3, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(2 /* truncation index */); + APPEND_EXPECT(0, /* request ID */ + RAFT_CANCELED /* status */ + ); + TEAR_DOWN_UV; + return MUNIT_OK; +} + +/* Multiple truncate requests get canceled because we're closing. */ +TEST(truncate, closingMultiple, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, /* request ID */ + 3, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(2 /* truncation index */); + APPEND_SUBMIT(1, /* request ID */ + 2, /* n entries */ + 8 /* entry size */ + ); + TRUNCATE(3 /* truncation index */); + APPEND_EXPECT(0, /* request ID */ + RAFT_CANCELED /* status */ + ); + APPEND_EXPECT(1, /* request ID */ + RAFT_CANCELED /* status */ + ); + TEAR_DOWN_UV; + return MUNIT_OK; +} diff --git a/test/raft/integration/test_uv_truncate_snapshot.c b/test/raft/integration/test_uv_truncate_snapshot.c new file mode 100644 index 000000000..adbe88398 --- /dev/null +++ b/test/raft/integration/test_uv_truncate_snapshot.c @@ -0,0 +1,244 @@ +#include "../lib/runner.h" +#include "../lib/uv.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_UV_DEPS; + FIXTURE_UV; + int count; /* To generate deterministic entry data */ +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Maximum number of blocks a segment can have */ +#define MAX_SEGMENT_BLOCKS 4 + +/* This block size should work fine for all file systems. 
*/
+#define SEGMENT_BLOCK_SIZE 4096
+
+/* Default segment size */
+#define SEGMENT_SIZE (4096 * MAX_SEGMENT_BLOCKS)
+
+struct result
+{
+    int status;
+    bool done;
+    void *data;
+};
+
+static void appendCbAssertResult(struct raft_io_append *req, int status)
+{
+    struct result *result = req->data;
+    munit_assert_int(status, ==, result->status);
+    result->done = true;
+}
+
+static void snapshotPutCbAssertResult(struct raft_io_snapshot_put *req,
+                                      int status)
+{
+    struct result *result = req->data;
+    munit_assert_int(status, ==, result->status);
+    result->done = true;
+}
+
+/* Declare and fill the entries array for the append request identified by
+ * I. The array will have N entries, and each entry will have a data buffer of
+ * SIZE bytes. */
+#define ENTRIES(I, N, SIZE)                                 \
+    struct raft_entry _entries##I[N];                       \
+    uint8_t _entries_data##I[N * SIZE];                     \
+    do {                                                    \
+        int _i;                                             \
+        for (_i = 0; _i < N; _i++) {                        \
+            struct raft_entry *entry = &_entries##I[_i];    \
+            entry->term = 1;                                \
+            entry->type = RAFT_COMMAND;                     \
+            entry->buf.base = &_entries_data##I[_i * SIZE]; \
+            entry->buf.len = SIZE;                          \
+            entry->batch = NULL;                            \
+            munit_assert_ptr_not_null(entry->buf.base);     \
+            memset(entry->buf.base, 0, entry->buf.len);     \
+            f->count++;                                     \
+            *(uint64_t *)entry->buf.base = f->count;        \
+        }                                                   \
+    } while (0)
+
+/* Submit an append request identified by I, with N_ENTRIES entries, each one
+ * of size ENTRY_SIZE. */
+#define APPEND_SUBMIT(I, N_ENTRIES, ENTRY_SIZE)                     \
+    struct raft_io_append _req##I;                                  \
+    struct result _result##I = {0, false, NULL};                    \
+    int _rv##I;                                                     \
+    ENTRIES(I, N_ENTRIES, ENTRY_SIZE);                              \
+    _req##I.data = &_result##I;                                     \
+    _rv##I = f->io.append(&f->io, &_req##I, _entries##I, N_ENTRIES, \
+                          appendCbAssertResult);                    \
+    munit_assert_int(_rv##I, ==, 0)
+
+#define TRUNCATE(N)                      \
+    do {                                 \
+        int rv_;                         \
+        rv_ = f->io.truncate(&f->io, N); \
+        munit_assert_int(rv_, ==, 0);    \
+    } while (0)
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    SETUP_UV;
+    raft_uv_set_block_size(&f->io, SEGMENT_BLOCK_SIZE);
+    raft_uv_set_segment_size(&f->io, SEGMENT_SIZE);
+    f->count = 0;
+    return f;
+}
+
+static void tearDownDeps(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * Assertions
+ *
+ *****************************************************************************/
+
+/* Shutdown the fixture's raft_io instance, then load all entries on disk using
+ * a new raft_io instance, and assert that there are N entries with data
+ * matching the DATA array. */
+#define ASSERT_ENTRIES(N, ...)
\ + TEAR_DOWN_UV; \ + do { \ + struct uv_loop_s _loop; \ + struct raft_uv_transport _transport; \ + struct raft_io _io; \ + raft_term _term; \ + raft_id _voted_for; \ + struct raft_snapshot *_snap; \ + raft_index _start_index; \ + struct raft_entry *_entries; \ + size_t _i; \ + size_t _n; \ + void *_batch = NULL; \ + unsigned _data[N] = {__VA_ARGS__}; \ + int _ret; \ + \ + _ret = uv_loop_init(&_loop); \ + munit_assert_int(_ret, ==, 0); \ + _transport.version = 1; \ + _ret = raft_uv_tcp_init(&_transport, &_loop); \ + munit_assert_int(_ret, ==, 0); \ + _ret = raft_uv_init(&_io, &_loop, f->dir, &_transport); \ + munit_assert_int(_ret, ==, 0); \ + _ret = _io.init(&_io, 1, "1"); \ + munit_assert_int(_ret, ==, 0); \ + _ret = _io.load(&_io, &_term, &_voted_for, &_snap, &_start_index, \ + &_entries, &_n); \ + munit_assert_int(_ret, ==, 0); \ + _io.close(&_io, NULL); \ + uv_run(&_loop, UV_RUN_NOWAIT); \ + raft_uv_close(&_io); \ + raft_uv_tcp_close(&_transport); \ + uv_loop_close(&_loop); \ + \ + munit_assert_size(_n, ==, N); \ + for (_i = 0; _i < _n; _i++) { \ + struct raft_entry *_entry = &_entries[_i]; \ + uint64_t _value = *(uint64_t *)_entry->buf.base; \ + munit_assert_int(_entry->term, ==, 1); \ + munit_assert_int(_entry->type, ==, RAFT_COMMAND); \ + munit_assert_int(_value, ==, _data[_i]); \ + munit_assert_ptr_not_null(_entry->batch); \ + } \ + for (_i = 0; _i < _n; _i++) { \ + struct raft_entry *_entry = &_entries[_i]; \ + if (_entry->batch != _batch) { \ + _batch = _entry->batch; \ + raft_free(_batch); \ + } \ + } \ + raft_free(_entries); \ + if (_snap != NULL) { \ + raft_configuration_close(&_snap->configuration); \ + munit_assert_int(_snap->n_bufs, ==, 1); \ + raft_free(_snap->bufs[0].base); \ + raft_free(_snap->bufs); \ + raft_free(_snap); \ + } \ + } while (0); + +#define SNAPSHOT_PUT_REQ(TRAILING, INDEX, RV, STATUS) \ + struct raft_snapshot _snapshot; \ + struct raft_buffer _snapshot_buf; \ + uint64_t _snapshot_data; \ + struct raft_io_snapshot_put _req; \ + struct result _result = {STATUS, false, NULL}; \ + int _rv; \ + _snapshot.term = 1; \ + _snapshot.index = INDEX; \ + raft_configuration_init(&_snapshot.configuration); \ + _rv = raft_configuration_add(&_snapshot.configuration, 1, "1", \ + RAFT_STANDBY); \ + munit_assert_int(_rv, ==, 0); \ + _snapshot.bufs = &_snapshot_buf; \ + _snapshot.n_bufs = 1; \ + _snapshot_buf.base = &_snapshot_data; \ + _snapshot_buf.len = sizeof _snapshot_data; \ + _req.data = &_result; \ + _rv = f->io.snapshot_put(&f->io, TRAILING, &_req, &_snapshot, \ + snapshotPutCbAssertResult); \ + munit_assert_int(_rv, ==, RV) + +#define SNAPSHOT_CLEANUP() raft_configuration_close(&_snapshot.configuration) + +/****************************************************************************** + * + * test interaction of raft_io->snapshot_put and raft_io->truncate() + * + *****************************************************************************/ + +SUITE(snapshot_truncate) + +/* Fill up 3 segments worth of data, then take a snapshot. + * While the snapshot is taken, start a truncate request. */ +TEST(snapshot_truncate, snapshotThenTruncate, setUp, tearDownDeps, 0, NULL) +{ + struct fixture *f = data; + APPEND_SUBMIT(0, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(1, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + APPEND_SUBMIT(2, MAX_SEGMENT_BLOCKS, SEGMENT_BLOCK_SIZE); + + /* Take a snapshot, this will use a uv_barrier. */ + SNAPSHOT_PUT_REQ(8192, 6, 0, 0); + + /* Truncate, this will use a uv_barrier too. 
*/
+    TRUNCATE(8);
+
+    /* There's no truncate callback to wait for, so loop for a while. */
+    LOOP_RUN(1000);
+
+    /* Check that truncate has done its job. */
+    ASSERT_ENTRIES(7, 1, 2, 3, 4, 5, 6, 7);
+
+    SNAPSHOT_CLEANUP();
+    return MUNIT_OK;
+}
diff --git a/test/raft/integration/test_uv_work.c b/test/raft/integration/test_uv_work.c
new file mode 100644
index 000000000..14bfc41da
--- /dev/null
+++ b/test/raft/integration/test_uv_work.c
@@ -0,0 +1,103 @@
+#include <unistd.h>
+
+#include "../../../src/raft/uv.h"
+#include "../lib/dir.h"
+#include "../lib/loop.h"
+#include "../lib/runner.h"
+#include "../lib/uv.h"
+
+/******************************************************************************
+ *
+ * Fixture
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_UV_DEPS;
+    FIXTURE_UV;
+};
+
+struct result
+{
+    int rv;      /* Indicate success or failure of the work */
+    int counter; /* Proof that work was performed */
+    bool done;   /* To check test termination */
+};
+
+/******************************************************************************
+ *
+ * Set up and tear down.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_UV_DEPS;
+    SETUP_UV;
+    return f;
+}
+
+static void tearDownDeps(void *data)
+{
+    struct fixture *f = data;
+    if (f == NULL) {
+        return;
+    }
+    TEAR_DOWN_UV_DEPS;
+    free(f);
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    if (f == NULL) {
+        return;
+    }
+    TEAR_DOWN_UV;
+    tearDownDeps(f);
+}
+
+/******************************************************************************
+ *
+ * UvAsyncWork
+ *
+ *****************************************************************************/
+
+static void asyncWorkCbAssertResult(struct raft_io_async_work *req, int status)
+{
+    struct result *r = req->data;
+    munit_assert_int(status, ==, r->rv);
+    munit_assert_int(r->counter, ==, 1);
+    r->done = true;
+}
+
+static int asyncWorkFn(struct raft_io_async_work *req)
+{
+    struct result *r = req->data;
+    sleep(1);
+    r->counter = 1;
+    return r->rv;
+}
+
+SUITE(UvAsyncWork)
+
+static char *rvs[] = {"-1", "0", "1", "37", NULL};
+static MunitParameterEnum rvs_params[] = {
+    {"rv", rvs},
+    {NULL, NULL},
+};
+
+TEST(UvAsyncWork, work, setUp, tearDown, 0, rvs_params)
+{
+    struct fixture *f = data;
+    struct result res = {0};
+    struct raft_io_async_work req = {0};
+    res.rv = (int)strtol(munit_parameters_get(params, "rv"), NULL, 0);
+    req.data = &res;
+    req.work = asyncWorkFn;
+    UvAsyncWork(&f->io, &req, asyncWorkCbAssertResult);
+    LOOP_RUN_UNTIL(&res.done);
+    return MUNIT_OK;
+}
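A note on the contract exercised by test_uv_work.c above: the work function runs on a libuv threadpool thread (hence the sleep() to prove it does not block the loop), and its return value is delivered unchanged as the status argument of the completion callback, which runs back on the loop thread. A condensed, hypothetical illustration under the same fixture conventions (squareFn and squareDoneCb are made-up names):

static int squareFn(struct raft_io_async_work *req)
{
    int *value = req->data; /* runs off the loop thread; may block */
    *value = (*value) * (*value);
    return 0; /* becomes `status` in the completion callback */
}

static void squareDoneCb(struct raft_io_async_work *req, int status)
{
    int *value = req->data; /* runs back on the loop thread */
    munit_assert_int(status, ==, 0);
    munit_assert_int(*value, ==, 49);
}

/* Usage, inside a test with the same fixture as above:
 *
 *     int value = 7;
 *     struct raft_io_async_work req = {0};
 *     req.data = &value;
 *     req.work = squareFn;
 *     UvAsyncWork(&f->io, &req, squareDoneCb);
 */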
diff --git a/test/raft/integration/test_voter_contacts.c b/test/raft/integration/test_voter_contacts.c
new file mode 100644
index 000000000..ab6405db0
--- /dev/null
+++ b/test/raft/integration/test_voter_contacts.c
@@ -0,0 +1,105 @@
+#include "../lib/cluster.h"
+#include "../lib/runner.h"
+
+#define N_SERVERS 3
+
+/******************************************************************************
+ *
+ * Fixture with a test raft cluster.
+ *
+ *****************************************************************************/
+
+struct fixture
+{
+    FIXTURE_CLUSTER;
+};
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+#define STEP_N(N) raft_fixture_step_n(&f->cluster, N)
+
+/******************************************************************************
+ *
+ * Set up a cluster with three servers.
+ *
+ *****************************************************************************/
+
+static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
+{
+    struct fixture *f = munit_malloc(sizeof *f);
+    SETUP_CLUSTER(N_SERVERS);
+    CLUSTER_BOOTSTRAP;
+    CLUSTER_START;
+    CLUSTER_ELECT(0);
+    return f;
+}
+
+static void tearDown(void *data)
+{
+    struct fixture *f = data;
+    TEAR_DOWN_CLUSTER;
+    free(f);
+}
+
+/******************************************************************************
+ *
+ * raft_voter_contacts
+ *
+ *****************************************************************************/
+
+SUITE(raft_voter_contacts)
+
+TEST(raft_voter_contacts, upToDate, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+
+    CLUSTER_STEP_UNTIL_HAS_LEADER(1000);
+    CLUSTER_STEP_N(1000);
+
+    /* N node cluster with leader */
+    for (unsigned int i = 0; i < N_SERVERS; i++) {
+        int count = raft_voter_contacts(CLUSTER_RAFT(i));
+        if (i == CLUSTER_LEADER) {
+            munit_assert_int(count, ==, N_SERVERS);
+        } else {
+            munit_assert_int(count, ==, -1);
+        }
+    }
+
+    /* Kill the cluster leader, so a new leader is elected and the reported
+     * number of contacted voters decreases. */
+    unsigned int leader = CLUSTER_LEADER;
+    CLUSTER_KILL(leader);
+    CLUSTER_STEP_UNTIL_HAS_LEADER(1000);
+    CLUSTER_STEP_N(1000);
+
+    for (unsigned int i = 0; i < N_SERVERS; i++) {
+        if (i == leader) {
+            continue;
+        }
+        int count = raft_voter_contacts(CLUSTER_RAFT(i));
+        if (i == CLUSTER_LEADER) {
+            munit_assert_int(count, ==, N_SERVERS - 1);
+        } else {
+            munit_assert_int(count, ==, -1);
+        }
+    }
+
+    /* Revive the old leader, so the count should go back up */
+    CLUSTER_REVIVE(leader);
+    CLUSTER_STEP_N(1000);
+    for (unsigned int i = 0; i < N_SERVERS; i++) {
+        int count = raft_voter_contacts(CLUSTER_RAFT(i));
+        if (i == CLUSTER_LEADER) {
+            munit_assert_int(count, ==, N_SERVERS);
+        } else {
+            munit_assert_int(count, ==, -1);
+        }
+    }
+
+    return MUNIT_OK;
+}
diff --git a/test/raft/lib/addrinfo.c b/test/raft/lib/addrinfo.c
new file mode 100644
index 000000000..532ddab5f
--- /dev/null
+++ b/test/raft/lib/addrinfo.c
@@ -0,0 +1,173 @@
+#include "addrinfo.h"
+
+#include <dlfcn.h>
+
+#include <netdb.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <uv.h>
+
+bool addrinfo_mock_enabled = false;
+
+enum addrinfo_mock_state { MockResultSet, MockResultReturned, SystemResult };
+
+struct addrinfo_mock_data
+{
+    enum addrinfo_mock_state state;
+    int rv;
+    struct addrinfo *result;
+    struct addrinfo_mock_data *next;
+};
+
+static struct addrinfo_mock_data *addrinfo_data;
+
+void AddrinfoInjectSetUp(MUNIT_UNUSED const MunitParameter params[])
+{
+    munit_assert_int(addrinfo_mock_enabled, ==, false);
+    munit_assert_ptr((void *)addrinfo_data, ==, NULL);
+    addrinfo_mock_enabled = true;
+}
+
+void AddrinfoInjectTearDown(void)
+{
+    munit_assert_int(addrinfo_mock_enabled, ==, true);
+    // If data is not freed, freeaddrinfo was not invoked.
+ munit_assert_ptr((void *)addrinfo_data, ==, NULL); + addrinfo_mock_enabled = false; +} + +void AddrinfoInjectSetResponse(int rv, + int num_results, + const struct AddrinfoResult *results) +{ + munit_assert_int(addrinfo_mock_enabled, ==, true); + munit_assert(!addrinfo_data || addrinfo_data->state == MockResultReturned); + munit_assert(rv || (num_results && results)); + + struct addrinfo_mock_data *response = + malloc(sizeof(struct addrinfo_mock_data)); + munit_assert_ptr((void *)response, !=, NULL); + response->state = MockResultSet; + response->rv = rv; + response->result = NULL; + for (int i = num_results - 1; i >= 0; --i) { + struct sockaddr_in *addr_in = malloc(sizeof(struct sockaddr_in)); + munit_assert_ptr((void *)addr_in, !=, NULL); + munit_assert_int(uv_ip4_addr(results[i].ip, results[i].port, addr_in), + ==, 0); + + struct addrinfo *ai = malloc(sizeof(struct addrinfo)); + munit_assert_ptr((void *)ai, !=, NULL); + ai->ai_flags = 0; + ai->ai_family = AF_INET; + ai->ai_socktype = SOCK_STREAM; + ai->ai_protocol = IPPROTO_TCP; + ai->ai_addrlen = sizeof(struct sockaddr_in); + ai->ai_addr = (struct sockaddr *)addr_in; + ai->ai_canonname = NULL; + ai->ai_next = response->result; + response->result = ai; + } + response->next = addrinfo_data; + addrinfo_data = response; +} + +static int invoke_system_getaddrinfo(const char *node, + const char *service, + const struct addrinfo *hints, + struct addrinfo **res) +{ + int (*system_getaddrinfo)(const char *node, const char *service, + const struct addrinfo *hints, + struct addrinfo **res); + *(void **)(&system_getaddrinfo) = dlsym(RTLD_NEXT, "getaddrinfo"); + munit_assert_ptr(*(void **)&system_getaddrinfo, !=, NULL); + return (*system_getaddrinfo)(node, service, hints, res); +} + +int getaddrinfo(const char *node, + const char *service, + const struct addrinfo *hints, + struct addrinfo **res) +{ + int rv; + + if (!addrinfo_mock_enabled) { + return invoke_system_getaddrinfo(node, service, hints, res); + } + if (!addrinfo_data || addrinfo_data->state == SystemResult) { + /* We have not injected response, invoke system function */ + rv = invoke_system_getaddrinfo(node, service, hints, res); + if (!rv) { + /* Store result for check on freeaddrinfo */ + struct addrinfo_mock_data *response = + malloc(sizeof(struct addrinfo_mock_data)); + munit_assert_ptr((void *)response, !=, NULL); + response->state = SystemResult; + response->rv = rv; + response->result = *res; + response->next = addrinfo_data; + addrinfo_data = response; + } + return rv; + } + if (addrinfo_data) { + munit_assert_int(addrinfo_data->state, ==, MockResultSet); + addrinfo_data->state = MockResultReturned; + rv = addrinfo_data->rv; + if (!rv) { + *res = addrinfo_data->result; + } else { + *res = NULL; + struct addrinfo_mock_data *response = addrinfo_data; + munit_assert_ptr((void *)response->result, ==, NULL); + addrinfo_data = response->next; + free(response); + } + return rv; + } + return EAI_FAIL; +} + +static void invoke_system_freeaddrinfo(struct addrinfo *res) +{ + int (*system_freeaddrinfo)(struct addrinfo * res); + *(void **)(&system_freeaddrinfo) = dlsym(RTLD_NEXT, "freeaddrinfo"); + munit_assert_ptr(*(void **)&system_freeaddrinfo, !=, NULL); + (*system_freeaddrinfo)(res); +} + +void freeaddrinfo(struct addrinfo *res) +{ + struct addrinfo_mock_data **ptr; + struct addrinfo_mock_data *response; + + // freeaddrinfo should not be invoked with a NULL pointer + munit_assert_ptr((void *)res, !=, NULL); + + if (!addrinfo_mock_enabled) { + invoke_system_freeaddrinfo(res); + return; + 
}
+    for (ptr = &addrinfo_data; *ptr; ptr = &((*ptr)->next)) {
+        if ((*ptr)->result == res) {
+            break;
+        }
+    }
+    response = *ptr;
+    munit_assert_ptr((void *)response, !=, NULL);
+    *ptr = response->next;
+    if (response->state == SystemResult) {
+        invoke_system_freeaddrinfo(response->result);
+    } else {
+        munit_assert_int(response->state, ==, MockResultReturned);
+        res = response->result;
+        while (res) {
+            struct addrinfo *next = res->ai_next;
+            free(res->ai_addr);
+            free(res);
+            res = next;
+        }
+    }
+    free(response);
+}
diff --git a/test/raft/lib/addrinfo.h b/test/raft/lib/addrinfo.h
new file mode 100644
index 000000000..cc29d5864
--- /dev/null
+++ b/test/raft/lib/addrinfo.h
@@ -0,0 +1,35 @@
+/* Support for getaddrinfo injection for test purposes.
+ *
+ * Provide a locally bound version to capture the getaddrinfo/freeaddrinfo
+ * invocations. The helper may operate in three different modes:
+ * a) Transparently forward calls to the system getaddrinfo/freeaddrinfo
+ *    functions, if SET_UP_ADDRINFO/TEAR_DOWN_ADDRINFO is not added to the
+ *    test case setup/teardown.
+ * b) Check that all results requested by getaddrinfo are freed using
+ *    freeaddrinfo. Activated by adding the SET_UP_ADDRINFO/TEAR_DOWN_ADDRINFO
+ *    macros to the test fixture.
+ * c) Additionally to b), inject artificial responses into the getaddrinfo
+ *    requests for test purposes by using AddrinfoInjectSetResponse before
+ *    triggering the getaddrinfo calls.
+ */
+
+#ifndef TEST_ADDRINFO_H
+#define TEST_ADDRINFO_H
+
+#include "munit.h"
+
+#define SET_UP_ADDRINFO AddrinfoInjectSetUp(params)
+#define TEAR_DOWN_ADDRINFO AddrinfoInjectTearDown()
+
+typedef struct AddrinfoResult
+{
+    const char *ip;
+    const int port;
+} AddrinfoResult_t;
+
+void AddrinfoInjectSetResponse(int rv,
+                               int num_results,
+                               const struct AddrinfoResult *results);
+
+void AddrinfoInjectSetUp(const MunitParameter params[]);
+void AddrinfoInjectTearDown(void);
+
+#endif // #ifndef TEST_ADDRINFO_H
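To make the three modes concrete, here is a hypothetical test exercising mode (c): it injects two fabricated records, calls the interposed getaddrinfo() directly, and frees the result so that mode (b)'s leak check in TEAR_DOWN_ADDRINFO passes. The host name is deliberately fake, since the mock never consults DNS:

#include <netdb.h>

static MunitResult testInjectedLookup(const MunitParameter params[],
                                      MUNIT_UNUSED void *data)
{
    const struct AddrinfoResult results[] = {{"127.0.0.1", 8080},
                                             {"127.0.0.2", 8080}};
    struct addrinfo hints = {0};
    struct addrinfo *res = NULL;

    SET_UP_ADDRINFO;
    AddrinfoInjectSetResponse(0, 2, results);

    /* The interposed getaddrinfo() returns the injected records. */
    munit_assert_int(getaddrinfo("fake-host.test", "8080", &hints, &res),
                     ==, 0);
    munit_assert_ptr_not_null(res);
    munit_assert_ptr_not_null(res->ai_next); /* both injected records */

    freeaddrinfo(res); /* mandatory: the teardown asserts nothing leaked */
    TEAR_DOWN_ADDRINFO;
    return MUNIT_OK;
}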
diff --git a/test/raft/lib/aio.c b/test/raft/lib/aio.c
new file mode 100644
index 000000000..c731b734b
--- /dev/null
+++ b/test/raft/lib/aio.c
@@ -0,0 +1,66 @@
+#include "aio.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "munit.h"
+
+int AioFill(aio_context_t *ctx, unsigned n)
+{
+    char buf[256];
+    int fd;
+    int rv;
+    int limit;
+    int used;
+
+    /* Figure out how many events are available. */
+    fd = open("/proc/sys/fs/aio-max-nr", O_RDONLY);
+    munit_assert_int(fd, !=, -1);
+
+    rv = read(fd, buf, sizeof buf);
+    munit_assert_int(rv, !=, -1);
+
+    close(fd);
+
+    limit = atoi(buf);
+    munit_assert_int(limit, >, 0);
+
+    /* Figure out how many events are in use. */
+    fd = open("/proc/sys/fs/aio-nr", O_RDONLY);
+    munit_assert_int(fd, !=, -1);
+
+    rv = read(fd, buf, sizeof buf);
+    munit_assert_int(rv, !=, -1);
+
+    close(fd);
+
+    used = atoi(buf);
+    munit_assert_int(used, >=, 0);
+
+    /* Best-effort check that no other process is using AIO. Our own unit test
+     * cases use up to 2 event slots at the time this function is called, so we
+     * don't consider those. */
+    if (used > 2) {
+        return -1;
+    }
+
+    rv = syscall(__NR_io_setup, limit - used - n, ctx);
+    if (rv != 0) {
+        /* The `limit - used - n` calculation is racy and io_setup can fail
+         * with EAGAIN if in the meantime another process has reserved some
+         * events. */
+        munit_assert_int(errno, ==, EAGAIN);
+        return -1;
+    }
+
+    return 0;
+}
+
+void AioDestroy(aio_context_t ctx)
+{
+    int rv;
+
+    rv = syscall(__NR_io_destroy, ctx);
+    munit_assert_int(rv, ==, 0);
+}
diff --git a/test/raft/lib/aio.h b/test/raft/lib/aio.h
new file mode 100644
index 000000000..f4540e0f6
--- /dev/null
+++ b/test/raft/lib/aio.h
@@ -0,0 +1,19 @@
+/* Utilities around the Kernel AIO sub-system. */
+#ifndef TEST_AIO_H
+#define TEST_AIO_H
+
+#include <linux/aio_abi.h>
+
+/* Fill the AIO subsystem resources by allocating a lot of events to the given
+ * context, and leaving only @n events available for subsequent calls to
+ * @io_setup.
+ *
+ * Return -1 if it looks like there is another process already using the AIO
+ * subsystem, which would most probably make the calling test flaky because
+ * there won't be exactly @n events available anymore. */
+int AioFill(aio_context_t *ctx, unsigned n);
+
+/* Destroy the given AIO context. */
+void AioDestroy(aio_context_t ctx);
+
+#endif /* TEST_AIO_H */
diff --git a/test/raft/lib/cluster.c b/test/raft/lib/cluster.c
new file mode 100644
index 000000000..68190389d
--- /dev/null
+++ b/test/raft/lib/cluster.c
@@ -0,0 +1,45 @@
+#include "cluster.h"
+
+static void randomize(struct raft_fixture *f, unsigned i, int what)
+{
+    struct raft *raft = raft_fixture_get(f, i);
+    switch (what) {
+        case RAFT_FIXTURE_TICK:
+            /* TODO: provide an API to inspect how much time has elapsed since
+             * the last election timer reset */
+            if (raft->election_timer_start == raft->io->time(raft->io)) {
+                raft_fixture_set_randomized_election_timeout(
+                    f, i,
+                    munit_rand_int_range(raft->election_timeout,
+                                         raft->election_timeout * 2));
+            }
+            break;
+        case RAFT_FIXTURE_DISK:
+            raft_fixture_set_disk_latency(f, i, munit_rand_int_range(10, 25));
+            break;
+        case RAFT_FIXTURE_NETWORK:
+            raft_fixture_set_network_latency(f, i,
+                                             munit_rand_int_range(25, 50));
+            break;
+        default:
+            munit_assert(0);
+            break;
+    }
+}
+
+void cluster_randomize_init(struct raft_fixture *f)
+{
+    unsigned i;
+    for (i = 0; i < raft_fixture_n(f); i++) {
+        randomize(f, i, RAFT_FIXTURE_TICK);
+        randomize(f, i, RAFT_FIXTURE_DISK);
+        randomize(f, i, RAFT_FIXTURE_NETWORK);
+    }
+}
+
+void cluster_randomize(struct raft_fixture *f,
+                       struct raft_fixture_event *event)
+{
+    unsigned index = raft_fixture_event_server_index(event);
+    int type = raft_fixture_event_type(event);
+    randomize(f, index, type);
+}
diff --git a/test/raft/lib/cluster.h b/test/raft/lib/cluster.h
new file mode 100644
index 000000000..602424d17
--- /dev/null
+++ b/test/raft/lib/cluster.h
@@ -0,0 +1,436 @@
+/* Setup and drive a test raft cluster. */
+
+#ifndef TEST_CLUSTER_H
+#define TEST_CLUSTER_H
+
+#include <stdlib.h>
+
+#include "../../../src/raft.h"
+#include "fsm.h"
+#include "heap.h"
+#include "munit.h"
+#include "snapshot.h"
+
+#define FIXTURE_CLUSTER                             \
+    FIXTURE_HEAP;                                   \
+    struct raft_fsm fsms[RAFT_FIXTURE_MAX_SERVERS]; \
+    struct raft_fixture cluster
+
+/* N is the default number of servers, but can be tweaked with the cluster-n
+ * parameter.
*/ +#define SETUP_CLUSTER(DEFAULT_N) \ + SET_UP_HEAP; \ + do { \ + unsigned _n = DEFAULT_N; \ + bool _pre_vote = false; \ + bool _ss_async = false; \ + int _fsm_version = 3; \ + unsigned _hb = 0; \ + unsigned _i; \ + int _rv; \ + if (munit_parameters_get(params, CLUSTER_N_PARAM) != NULL) { \ + _n = atoi(munit_parameters_get(params, CLUSTER_N_PARAM)); \ + } \ + if (munit_parameters_get(params, CLUSTER_PRE_VOTE_PARAM) != NULL) { \ + _pre_vote = \ + atoi(munit_parameters_get(params, CLUSTER_PRE_VOTE_PARAM)); \ + } \ + if (munit_parameters_get(params, CLUSTER_HEARTBEAT_PARAM) != NULL) { \ + _hb = atoi(munit_parameters_get(params, CLUSTER_HEARTBEAT_PARAM)); \ + } \ + if (munit_parameters_get(params, CLUSTER_SS_ASYNC_PARAM) != NULL) { \ + _ss_async = \ + atoi(munit_parameters_get(params, CLUSTER_SS_ASYNC_PARAM)); \ + } \ + if (munit_parameters_get(params, CLUSTER_FSM_VERSION_PARAM) != NULL) { \ + _fsm_version = \ + atoi(munit_parameters_get(params, CLUSTER_FSM_VERSION_PARAM)); \ + } \ + munit_assert_int(_n, >, 0); \ + _rv = raft_fixture_init(&f->cluster); \ + munit_assert_int(_rv, ==, 0); \ + for (_i = 0; _i < _n; _i++) { \ + if (!_ss_async || _fsm_version < 3) { \ + FsmInit(&f->fsms[_i], _fsm_version); \ + } else { \ + FsmInitAsync(&f->fsms[_i], _fsm_version); \ + } \ + _rv = raft_fixture_grow(&f->cluster, &f->fsms[_i]); \ + munit_assert_int(_rv, ==, 0); \ + } \ + for (_i = 0; _i < _n; _i++) { \ + raft_set_pre_vote(raft_fixture_get(&f->cluster, _i), _pre_vote); \ + if (_hb) { \ + raft_set_heartbeat_timeout(raft_fixture_get(&f->cluster, _i), \ + _hb); \ + } \ + } \ + } while (0) + +#define TEAR_DOWN_CLUSTER \ + do { \ + unsigned i; \ + raft_fixture_close(&f->cluster); \ + for (i = 0; i < CLUSTER_N; i++) { \ + FsmClose(&f->fsms[i]); \ + } \ + } while (0); \ + TEAR_DOWN_HEAP; + +/* Munit parameter for setting the number of servers */ +#define CLUSTER_N_PARAM "cluster-n" + +/* Munit parameter for setting the number of voting servers */ +#define CLUSTER_N_VOTING_PARAM "cluster-n-voting" + +/* Munit parameter for enabling pre-vote */ +#define CLUSTER_PRE_VOTE_PARAM "cluster-pre-vote" + +/* Munit parameter for setting HeartBeat timeout */ +#define CLUSTER_HEARTBEAT_PARAM "cluster-heartbeat" + +/* Munit parameter for setting snapshot behaviour */ +#define CLUSTER_SS_ASYNC_PARAM "cluster-snapshot-async" + +/* Munit parameter for setting fsm version */ +#define CLUSTER_FSM_VERSION_PARAM "fsm-version" + +/* Get the number of servers in the cluster. */ +#define CLUSTER_N raft_fixture_n(&f->cluster) + +/* Get the cluster time. */ +#define CLUSTER_TIME raft_fixture_time(&f->cluster) + +/* Index of the current leader, or CLUSTER_N if there's no leader. */ +#define CLUSTER_LEADER raft_fixture_leader_index(&f->cluster) + +/* True if the cluster has a leader. */ +#define CLUSTER_HAS_LEADER CLUSTER_LEADER < CLUSTER_N + +/* Get the struct raft object of the I'th server. */ +#define CLUSTER_RAFT(I) raft_fixture_get(&f->cluster, I) + +/* Get the state of the I'th server. */ +#define CLUSTER_STATE(I) raft_state(raft_fixture_get(&f->cluster, I)) + +/* Get the current term of the I'th server. */ +#define CLUSTER_TERM(I) raft_fixture_get(&f->cluster, I)->current_term + +/* Get the struct fsm object of the I'th server. */ +#define CLUSTER_FSM(I) &f->fsms[I] + +/* Return the last applied index on the I'th server. */ +#define CLUSTER_LAST_APPLIED(I) \ + raft_last_applied(raft_fixture_get(&f->cluster, I)) + +/* Return the ID of the server the I'th server has voted for. 
*/
+#define CLUSTER_VOTED_FOR(I) raft_fixture_voted_for(&f->cluster, I)
+
+/* Return a description of the last error occurred on the I'th server. */
+#define CLUSTER_ERRMSG(I) raft_errmsg(CLUSTER_RAFT(I))
+
+/* Populate the given configuration with all servers in the fixture. All
+ * servers will be voting. */
+#define CLUSTER_CONFIGURATION(CONF)                                     \
+    {                                                                   \
+        int rv_;                                                        \
+        rv_ = raft_fixture_configuration(&f->cluster, CLUSTER_N, CONF); \
+        munit_assert_int(rv_, ==, 0);                                   \
+    }
+
+/* Bootstrap all servers in the cluster. All servers will be voting, unless the
+ * cluster-n-voting parameter is used. */
+#define CLUSTER_BOOTSTRAP                                                    \
+    {                                                                        \
+        unsigned n_ = CLUSTER_N;                                             \
+        int rv_;                                                             \
+        struct raft_configuration configuration;                             \
+        if (munit_parameters_get(params, CLUSTER_N_VOTING_PARAM) != NULL) {  \
+            n_ = atoi(munit_parameters_get(params, CLUSTER_N_VOTING_PARAM)); \
+        }                                                                    \
+        rv_ = raft_fixture_configuration(&f->cluster, n_, &configuration);   \
+        munit_assert_int(rv_, ==, 0);                                        \
+        rv_ = raft_fixture_bootstrap(&f->cluster, &configuration);           \
+        munit_assert_int(rv_, ==, 0);                                        \
+        raft_configuration_close(&configuration);                            \
+    }
+
+/* Bootstrap all servers in the cluster. Only the first N servers will be
+ * voting. */
+#define CLUSTER_BOOTSTRAP_N_VOTING(N)                                      \
+    {                                                                      \
+        int rv_;                                                           \
+        struct raft_configuration configuration_;                          \
+        rv_ = raft_fixture_configuration(&f->cluster, N, &configuration_); \
+        munit_assert_int(rv_, ==, 0);                                      \
+        rv_ = raft_fixture_bootstrap(&f->cluster, &configuration_);        \
+        munit_assert_int(rv_, ==, 0);                                      \
+        raft_configuration_close(&configuration_);                         \
+    }
+
+/* Start all servers in the test cluster. */
+#define CLUSTER_START                         \
+    {                                         \
+        int rc;                               \
+        rc = raft_fixture_start(&f->cluster); \
+        munit_assert_int(rc, ==, 0);          \
+    }
+
+/* Step the cluster. */
+#define CLUSTER_STEP raft_fixture_step(&f->cluster);
+
+/* Step the cluster N times. */
+#define CLUSTER_STEP_N(N)                   \
+    {                                       \
+        unsigned i_;                        \
+        for (i_ = 0; i_ < N; i_++) {        \
+            raft_fixture_step(&f->cluster); \
+        }                                   \
+    }
+
+/* Step until the given function becomes true. */
+#define CLUSTER_STEP_UNTIL(FUNC, ARG, MSECS)                            \
+    {                                                                   \
+        bool done_;                                                     \
+        done_ = raft_fixture_step_until(&f->cluster, FUNC, ARG, MSECS); \
+        munit_assert_true(done_);                                       \
+    }
+
+/* Step the cluster until the given number of milliseconds have elapsed. */
+#define CLUSTER_STEP_UNTIL_ELAPSED(MSECS) \
+    raft_fixture_step_until_elapsed(&f->cluster, MSECS)
+
+/* Step the cluster until a leader is elected or #MAX_MSECS have elapsed. */
+#define CLUSTER_STEP_UNTIL_HAS_LEADER(MAX_MSECS)                           \
+    {                                                                      \
+        bool done;                                                         \
+        done = raft_fixture_step_until_has_leader(&f->cluster, MAX_MSECS); \
+        munit_assert_true(done);                                           \
+        munit_assert_true(CLUSTER_HAS_LEADER);                             \
+    }
+
+/* Step the cluster until there's no leader or #MAX_MSECS have elapsed. */
+#define CLUSTER_STEP_UNTIL_HAS_NO_LEADER(MAX_MSECS)                           \
+    {                                                                         \
+        bool done;                                                            \
+        done = raft_fixture_step_until_has_no_leader(&f->cluster, MAX_MSECS); \
+        munit_assert_true(done);                                              \
+        munit_assert_false(CLUSTER_HAS_LEADER);                               \
+    }
+
+/* Step the cluster until the given index has been applied by the given server
+ * (or all, if N) or #MAX_MSECS have elapsed. */
+#define CLUSTER_STEP_UNTIL_APPLIED(I, INDEX, MAX_MSECS)                        \
+    {                                                                          \
+        bool done;                                                             \
+        done =                                                                 \
+            raft_fixture_step_until_applied(&f->cluster, I, INDEX, MAX_MSECS); \
+        munit_assert_true(done);                                               \
+    }
+
+/* Step the cluster until the state of the server with the given index matches
+ * the given value, or #MAX_MSECS have elapsed.
*/ +#define CLUSTER_STEP_UNTIL_STATE_IS(I, STATE, MAX_MSECS) \ + { \ + bool done; \ + done = raft_fixture_step_until_state_is(&f->cluster, I, STATE, \ + MAX_MSECS); \ + munit_assert_true(done); \ + } + +/* Step the cluster until the term of the server with the given index matches + * the given value, or #MAX_MSECS have elapsed. */ +#define CLUSTER_STEP_UNTIL_TERM_IS(I, TERM, MAX_MSECS) \ + { \ + bool done; \ + done = \ + raft_fixture_step_until_term_is(&f->cluster, I, TERM, MAX_MSECS); \ + munit_assert_true(done); \ + } + +/* Step the cluster until server I has voted for server J, or #MAX_MSECS have + * elapsed. */ +#define CLUSTER_STEP_UNTIL_VOTED_FOR(I, J, MAX_MSECS) \ + { \ + bool done; \ + done = \ + raft_fixture_step_until_voted_for(&f->cluster, I, J, MAX_MSECS); \ + munit_assert_true(done); \ + } + +/* Step the cluster until all messages from server I to server J have been + * delivered, or #MAX_MSECS elapse. */ +#define CLUSTER_STEP_UNTIL_DELIVERED(I, J, MAX_MSECS) \ + { \ + bool done; \ + done = \ + raft_fixture_step_until_delivered(&f->cluster, I, J, MAX_MSECS); \ + munit_assert_true(done); \ + } + +/* Request to apply an FSM command to add the given value to x. */ +#define CLUSTER_APPLY_ADD_X(I, REQ, VALUE, CB) \ + { \ + struct raft_buffer buf_; \ + struct raft *raft_; \ + int rv_; \ + FsmEncodeAddX(VALUE, &buf_); \ + raft_ = raft_fixture_get(&f->cluster, I); \ + rv_ = raft_apply(raft_, REQ, &buf_, 1, CB); \ + munit_assert_int(rv_, ==, 0); \ + } + +/* Kill the I'th server. */ +#define CLUSTER_KILL(I) raft_fixture_kill(&f->cluster, I); + +/* Revive the I'th server */ +#define CLUSTER_REVIVE(I) raft_fixture_revive(&f->cluster, I); + +/* Kill the leader. */ +#define CLUSTER_KILL_LEADER CLUSTER_KILL(CLUSTER_LEADER) + +/* Kill a majority of servers, except the leader (if there is one). */ +#define CLUSTER_KILL_MAJORITY \ + { \ + size_t i2; \ + size_t n; \ + for (i2 = 0, n = 0; n < (CLUSTER_N / 2) + 1; i2++) { \ + if (i2 == CLUSTER_LEADER) { \ + continue; \ + } \ + CLUSTER_KILL(i2) \ + n++; \ + } \ + } + +/* Grow the cluster adding one server. */ +#define CLUSTER_GROW \ + { \ + int rv_; \ + FsmInit(&f->fsms[CLUSTER_N], 2); \ + rv_ = raft_fixture_grow(&f->cluster, &f->fsms[CLUSTER_N]); \ + munit_assert_int(rv_, ==, 0); \ + } + +/* Add a new pristine server to the cluster, connected to all others. Then + * submit a request to add it to the configuration as an idle server. */ +#define CLUSTER_ADD(REQ) \ + { \ + int rc; \ + struct raft *new_raft; \ + CLUSTER_GROW; \ + rc = raft_start(CLUSTER_RAFT(CLUSTER_N - 1)); \ + munit_assert_int(rc, ==, 0); \ + new_raft = CLUSTER_RAFT(CLUSTER_N - 1); \ + rc = raft_add(CLUSTER_RAFT(CLUSTER_LEADER), REQ, new_raft->id, \ + new_raft->address, NULL); \ + munit_assert_int(rc, ==, 0); \ + } + +/* Assign the given role to the server that was added last. */ +#define CLUSTER_ASSIGN(REQ, ROLE) \ + do { \ + unsigned _id; \ + int _rv; \ + _id = CLUSTER_N; /* Last server that was added. */ \ + _rv = raft_assign(CLUSTER_RAFT(CLUSTER_LEADER), REQ, _id, ROLE, NULL); \ + munit_assert_int(_rv, ==, 0); \ + } while (0) + +/* Ensure that the cluster can make progress from the current state. + * + * - If no leader is present, wait for one to be elected. + * - Submit a request to apply a new FSM command and wait for it to complete. 
*/
+#define CLUSTER_MAKE_PROGRESS                                          \
+    {                                                                  \
+        struct raft_apply *req_ = munit_malloc(sizeof *req_);          \
+        if (!(CLUSTER_HAS_LEADER)) {                                   \
+            CLUSTER_STEP_UNTIL_HAS_LEADER(10000);                      \
+        }                                                              \
+        CLUSTER_APPLY_ADD_X(CLUSTER_LEADER, req_, 1, NULL);            \
+        CLUSTER_STEP_UNTIL_APPLIED(CLUSTER_LEADER, req_->index, 3000); \
+        free(req_);                                                    \
+    }
+
+/* Elect the I'th server. */
+#define CLUSTER_ELECT(I) raft_fixture_elect(&f->cluster, I)
+
+/* Start to elect the I'th server. */
+#define CLUSTER_START_ELECT(I) raft_fixture_start_elect(&f->cluster, I)
+
+/* Depose the current leader. */
+#define CLUSTER_DEPOSE raft_fixture_depose(&f->cluster)
+
+/* Disconnect I from J. */
+#define CLUSTER_DISCONNECT(I, J) raft_fixture_disconnect(&f->cluster, I, J)
+
+/* Reconnect I to J. */
+#define CLUSTER_RECONNECT(I, J) raft_fixture_reconnect(&f->cluster, I, J)
+
+/* Saturate the connection from I to J. */
+#define CLUSTER_SATURATE(I, J) raft_fixture_saturate(&f->cluster, I, J)
+
+/* Saturate the connection between I and J, in both directions. */
+#define CLUSTER_SATURATE_BOTHWAYS(I, J) \
+    CLUSTER_SATURATE(I, J);             \
+    CLUSTER_SATURATE(J, I)
+
+/* Desaturate the connection from I to J, making messages flow again. */
+#define CLUSTER_DESATURATE(I, J) raft_fixture_desaturate(&f->cluster, I, J)
+
+/* Desaturate the connection between I and J, in both directions. */
+#define CLUSTER_DESATURATE_BOTHWAYS(I, J) \
+    CLUSTER_DESATURATE(I, J);             \
+    CLUSTER_DESATURATE(J, I)
+
+/* Set the network latency of outgoing messages of server I. */
+#define CLUSTER_SET_NETWORK_LATENCY(I, MSECS) \
+    raft_fixture_set_network_latency(&f->cluster, I, MSECS)
+
+/* Set the disk I/O latency of server I. */
+#define CLUSTER_SET_DISK_LATENCY(I, MSECS) \
+    raft_fixture_set_disk_latency(&f->cluster, I, MSECS)
+
+/* Set the term persisted on the I'th server. This must be called before
+ * starting the cluster. */
+#define CLUSTER_SET_TERM(I, TERM) raft_fixture_set_term(&f->cluster, I, TERM)
+
+/* Set the snapshot persisted on the I'th server. This must be called before
+ * starting the cluster. */
+#define CLUSTER_SET_SNAPSHOT(I, LAST_INDEX, LAST_TERM, CONF_INDEX, X, Y)  \
+    {                                                                     \
+        struct raft_configuration configuration_;                         \
+        struct raft_snapshot *snapshot_;                                  \
+        CLUSTER_CONFIGURATION(&configuration_);                           \
+        CREATE_SNAPSHOT(snapshot_, LAST_INDEX, LAST_TERM, configuration_, \
+                        CONF_INDEX, X, Y);                                \
+        raft_fixture_set_snapshot(&f->cluster, I, snapshot_);             \
+    }
+
+/* Add an entry to the ones persisted on the I'th server. This must be called
+ * before starting the cluster. */
+#define CLUSTER_ADD_ENTRY(I, ENTRY) \
+    raft_fixture_add_entry(&f->cluster, I, ENTRY)
+
+/* Return the number of messages sent by the given server. */
+#define CLUSTER_N_SEND(I, TYPE) raft_fixture_n_send(&f->cluster, I, TYPE)
+
+/* Return the number of messages received by the given server. */
+#define CLUSTER_N_RECV(I, TYPE) raft_fixture_n_recv(&f->cluster, I, TYPE)
+
+/* Set a fixture hook that randomizes election timeouts, disk latency and
+ * network latency. */
+#define CLUSTER_RANDOMIZE                \
+    cluster_randomize_init(&f->cluster); \
+    raft_fixture_hook(&f->cluster, cluster_randomize)
+
+void cluster_randomize_init(struct raft_fixture *f);
+void cluster_randomize(struct raft_fixture *f,
+                       struct raft_fixture_event *event);
+
+#endif /* TEST_CLUSTER_H */
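For reviewers, here is a minimal sketch (not part of this patch) of how the macros above compose in a munit test. It assumes the FIXTURE_CLUSTER, SETUP_CLUSTER and TEAR_DOWN_CLUSTER companion helpers defined earlier in cluster.h; the fixture and test names are made up for illustration:

    /* Hypothetical test built on the cluster harness (sketch only). */
    struct fixture
    {
        FIXTURE_CLUSTER; /* Assumed to provide f->cluster and f->fsms. */
    };

    static void *setUp(const MunitParameter params[], void *user_data)
    {
        struct fixture *f = munit_malloc(sizeof *f);
        (void)user_data;
        SETUP_CLUSTER(3);  /* Assumed helper: create a 3-server cluster. */
        CLUSTER_BOOTSTRAP; /* All voting, unless cluster-n-voting is set. */
        CLUSTER_START;
        return f;
    }

    static void tearDown(void *data)
    {
        struct fixture *f = data;
        TEAR_DOWN_CLUSTER; /* Assumed helper. */
        free(f);
    }

    static MunitResult testMakeProgress(const MunitParameter params[],
                                        void *data)
    {
        struct fixture *f = data;
        (void)params;
        CLUSTER_STEP_UNTIL_HAS_LEADER(10000); /* Wait for an election. */
        CLUSTER_MAKE_PROGRESS;                /* Commit one FSM command. */
        return MUNIT_OK;
    }

Note that the macros deliberately reference a variable named f (and, for CLUSTER_BOOTSTRAP, the params argument), so they only expand correctly inside functions with those names in scope, as in the sketch.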
diff --git a/test/raft/lib/dir.c b/test/raft/lib/dir.c
new file mode 100644
index 000000000..a2c8f1d36
--- /dev/null
+++ b/test/raft/lib/dir.c
@@ -0,0 +1,423 @@
+#include "dir.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <ftw.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#define SEP "/"
+#define TEMPLATE "raft-test-XXXXXX"
+
+#define TEST_DIR_TEMPLATE "./tmp/%s/raft-test-XXXXXX"
+
+static char *dirAll[] = {"tmpfs", "ext4", "btrfs", "xfs", "zfs", NULL};
+
+static char *dirTmpfs[] = {"tmpfs", NULL};
+
+static char *dirAio[] = {"btrfs", "ext4", "xfs", NULL};
+
+static char *dirNoAio[] = {"tmpfs", "zfs", NULL};
+
+MunitParameterEnum DirTmpfsParams[] = {
+    {DIR_FS_PARAM, dirTmpfs},
+    {NULL, NULL},
+};
+
+MunitParameterEnum DirAllParams[] = {
+    {DIR_FS_PARAM, dirAll},
+    {NULL, NULL},
+};
+
+MunitParameterEnum DirAioParams[] = {
+    {DIR_FS_PARAM, dirAio},
+    {NULL, NULL},
+};
+
+MunitParameterEnum DirNoAioParams[] = {
+    {DIR_FS_PARAM, dirNoAio},
+    {NULL, NULL},
+};
+
+/* Create a temporary directory in the given parent directory. */
+static char *dirMakeTemp(const char *parent)
+{
+    char *dir;
+    if (parent == NULL) {
+        return NULL;
+    }
+    dir = munit_malloc(strlen(parent) + strlen(SEP) + strlen(TEMPLATE) + 1);
+    sprintf(dir, "%s%s%s", parent, SEP, TEMPLATE);
+    if (mkdtemp(dir) == NULL) {
+        munit_error(strerror(errno));
+    }
+    return dir;
+}
+
+void *DirSetUp(MUNIT_UNUSED const MunitParameter params[],
+               MUNIT_UNUSED void *user_data)
+{
+    const char *fs = munit_parameters_get(params, DIR_FS_PARAM);
+    if (fs == NULL) {
+        return dirMakeTemp("/tmp");
+    } else if (strcmp(fs, "tmpfs") == 0) {
+        return DirTmpfsSetUp(params, user_data);
+    } else if (strcmp(fs, "ext4") == 0) {
+        return DirExt4SetUp(params, user_data);
+    } else if (strcmp(fs, "btrfs") == 0) {
+        return DirBtrfsSetUp(params, user_data);
+    } else if (strcmp(fs, "zfs") == 0) {
+        return DirZfsSetUp(params, user_data);
+    } else if (strcmp(fs, "xfs") == 0) {
+        return DirXfsSetUp(params, user_data);
+    }
+    munit_errorf("Unsupported file system %s", fs);
+    return NULL;
+}
+
+void *DirTmpfsSetUp(MUNIT_UNUSED const MunitParameter params[],
+                    MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_TMPFS"));
+}
+
+void *DirExt4SetUp(MUNIT_UNUSED const MunitParameter params[],
+                   MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_EXT4"));
+}
+
+void *DirBtrfsSetUp(MUNIT_UNUSED const MunitParameter params[],
+                    MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_BTRFS"));
+}
+
+void *DirZfsSetUp(MUNIT_UNUSED const MunitParameter params[],
+                  MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_ZFS"));
+}
+
+void *DirXfsSetUp(MUNIT_UNUSED const MunitParameter params[],
+                  MUNIT_UNUSED void *user_data)
+{
+    return dirMakeTemp(getenv("RAFT_TMP_XFS"));
+}
+
+/* Wrapper around remove(), compatible with nftw.
*/ +static int dirRemoveFn(const char *path, + MUNIT_UNUSED const struct stat *sbuf, + MUNIT_UNUSED int type, + MUNIT_UNUSED struct FTW *ftwb) +{ + return remove(path); +} + +static void dirRemove(char *dir) +{ + int rv; + rv = chmod(dir, 0755); + munit_assert_int(rv, ==, 0); + + rv = nftw(dir, dirRemoveFn, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS); + munit_assert_int(rv, ==, 0); +} + +static bool dirExists(const char *dir) +{ + struct stat sb; + int rv; + + rv = stat(dir, &sb); + if (rv == -1) { + munit_assert_int(errno, ==, ENOENT); + return false; + } + + return true; +} + +void DirTearDown(void *data) +{ + char *dir = data; + if (dir == NULL) { + return; + } + if (dirExists(dir)) { + dirRemove(dir); + } + free(dir); +} + +/* Join the given @dir and @filename into @path. */ +static void joinPath(const char *dir, const char *filename, char *path) +{ + strcpy(path, dir); + strcat(path, "/"); + strcat(path, filename); +} + +void DirWriteFile(const char *dir, + const char *filename, + const void *buf, + const size_t n) +{ + char path[256]; + int fd; + int rv; + + joinPath(dir, filename, path); + + fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + munit_assert_int(fd, !=, -1); + + rv = write(fd, buf, n); + munit_assert_int(rv, ==, n); + + close(fd); +} + +void DirWriteFileWithZeros(const char *dir, + const char *filename, + const size_t n) +{ + void *buf = munit_malloc(n); + + DirWriteFile(dir, filename, buf, n); + + free(buf); +} + +void DirOverwriteFile(const char *dir, + const char *filename, + const void *buf, + const size_t n, + const off_t whence) +{ + char path[256]; + int fd; + int rv; + off_t size; + + joinPath(dir, filename, path); + + fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); + + munit_assert_int(fd, !=, -1); + + /* Get the size of the file */ + size = lseek(fd, 0, SEEK_END); + + if (whence == 0) { + munit_assert_int(size, >=, n); + lseek(fd, 0, SEEK_SET); + } else if (whence > 0) { + munit_assert_int(whence, <=, size); + munit_assert_int(size - whence, >=, n); + lseek(fd, whence, SEEK_SET); + } else { + munit_assert_int(-whence, <=, size); + munit_assert_int(-whence, >=, n); + lseek(fd, whence, SEEK_END); + } + + rv = write(fd, buf, n); + munit_assert_int(rv, ==, n); + + close(fd); +} + +void DirTruncateFile(const char *dir, const char *filename, const size_t n) +{ + char path[256]; + int fd; + int rv; + + joinPath(dir, filename, path); + + fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); + munit_assert_int(fd, !=, -1); + + rv = ftruncate(fd, n); + munit_assert_int(rv, ==, 0); + + rv = close(fd); + munit_assert_int(rv, ==, 0); +} + +void DirGrowFile(const char *dir, const char *filename, const size_t n) +{ + char path[256]; + int fd; + struct stat sb; + void *buf; + size_t size; + int rv; + + joinPath(dir, filename, path); + + fd = open(path, O_RDWR, S_IRUSR | S_IWUSR); + munit_assert_int(fd, !=, -1); + + rv = fstat(fd, &sb); + munit_assert_int(rv, ==, 0); + munit_assert_int(sb.st_size, <=, n); + + /* Fill with zeros. 
*/ + lseek(fd, sb.st_size, SEEK_SET); + size = n - sb.st_size; + buf = munit_malloc(size); + rv = write(fd, buf, size); + munit_assert_int(rv, ==, size); + free(buf); + + rv = close(fd); + munit_assert_int(rv, ==, 0); +} + +void DirRenameFile(const char *dir, + const char *filename1, + const char *filename2) +{ + char path1[256]; + char path2[256]; + int rv; + + joinPath(dir, filename1, path1); + joinPath(dir, filename2, path2); + + rv = rename(path1, path2); + munit_assert_int(rv, ==, 0); +} + +void DirRemoveFile(const char *dir, const char *filename) +{ + char path[256]; + int rv; + + joinPath(dir, filename, path); + rv = unlink(path); + munit_assert_int(rv, ==, 0); +} + +void DirReadFile(const char *dir, + const char *filename, + void *buf, + const size_t n) +{ + char path[256]; + int fd; + int rv; + + joinPath(dir, filename, path); + + fd = open(path, O_RDONLY); + if (fd == -1) { + munit_logf(MUNIT_LOG_ERROR, "read file '%s': %s", path, + strerror(errno)); + } + + rv = read(fd, buf, n); + munit_assert_int(rv, ==, n); + + close(fd); +} + +void DirMakeUnexecutable(const char *dir) +{ + int rv; + + rv = chmod(dir, 0); + munit_assert_int(rv, ==, 0); +} + +void DirMakeUnwritable(const char *dir) +{ + int rv; + + rv = chmod(dir, 0500); + munit_assert_int(rv, ==, 0); +} + +void DirMakeFileUnreadable(const char *dir, const char *filename) +{ + char path[256]; + int rv; + + joinPath(dir, filename, path); + + rv = chmod(path, 0); + munit_assert_int(rv, ==, 0); +} + +bool DirHasFile(const char *dir, const char *filename) +{ + char path[256]; + int fd; + + joinPath(dir, filename, path); + + fd = open(path, O_RDONLY); + if (fd == -1) { + munit_assert_true(errno == ENOENT || errno == EACCES); + return false; + } + + close(fd); + + return true; +} + +void DirFill(const char *dir, const size_t n) +{ + char path[256]; + const char *filename = ".fill"; + struct statvfs fs; + size_t size; + int fd; + int rv; + + rv = statvfs(dir, &fs); + munit_assert_int(rv, ==, 0); + + size = fs.f_bsize * fs.f_bavail; + + if (n > 0) { + munit_assert_int(size, >=, n); + } + + joinPath(dir, filename, path); + + fd = open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + munit_assert_int(fd, !=, -1); + + rv = posix_fallocate(fd, 0, size - n); + munit_assert_int(rv, ==, 0); + + /* If n is zero, make sure any further write fails with ENOSPC */ + if (n == 0) { + char buf[4096]; + int i; + + rv = lseek(fd, 0, SEEK_END); + munit_assert_int(rv, !=, -1); + + for (i = 0; i < 40; i++) { + rv = write(fd, buf, sizeof buf); + if (rv < 0) { + break; + } + } + + munit_assert_int(rv, ==, -1); + munit_assert_int(errno, ==, ENOSPC); + } + + close(fd); +} diff --git a/test/raft/lib/dir.h b/test/raft/lib/dir.h new file mode 100644 index 000000000..7980e6c1b --- /dev/null +++ b/test/raft/lib/dir.h @@ -0,0 +1,142 @@ +/* Test directory utilities. + * + * This module sports helpers to create temporary directories backed by various + * file systems, read/write files in them, check for the presence of files + * etc. */ + +#ifndef TEST_DIR_H +#define TEST_DIR_H + +#include + +#include "munit.h" + +/* Munit parameter defining the file system type backing the temporary directory + * created by test_dir_setup(). + * + * The various file systems must have been previously setup with the fs.sh + * script. */ +#define DIR_FS_PARAM "dir-fs" + +#define FIXTURE_DIR char *dir +#define SET_UP_DIR \ + f->dir = DirSetUp(params, user_data); \ + if (f->dir == NULL) { /* Fs not available, test must be skipped. 
*/ \
+        free(f);                                 \
+        return NULL;                             \
+    }
+#define TEAR_DOWN_DIR DirTearDown(f->dir)
+
+/* Contains a single DIR_FS_PARAM parameter set to all supported file system
+ * types. */
+extern MunitParameterEnum DirAllParams[];
+
+/* Contains a single DIR_FS_PARAM parameter set to tmpfs. */
+extern MunitParameterEnum DirTmpfsParams[];
+
+/* Contains a single DIR_FS_PARAM parameter set to all file systems with
+ * proper AIO support (i.e. NOWAIT works). */
+extern MunitParameterEnum DirAioParams[];
+
+/* Contains a single DIR_FS_PARAM parameter set to all file systems without
+ * proper AIO support (i.e. NOWAIT does not work). */
+extern MunitParameterEnum DirNoAioParams[];
+
+/* Create a temporary test directory.
+ *
+ * Return a pointer to the path of the created directory. */
+void *DirSetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by tmpfs.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no tmpfs
+ * file system is available. */
+void *DirTmpfsSetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by ext4.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no ext4
+ * file system is available. */
+void *DirExt4SetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by btrfs.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no btrfs
+ * file system is available. */
+void *DirBtrfsSetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by zfs.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no zfs
+ * file system is available. */
+void *DirZfsSetUp(const MunitParameter params[], void *user_data);
+
+/* Create a temporary test directory backed by xfs.
+ *
+ * Return a pointer to the path of the created directory, or NULL if no xfs
+ * file system is available. */
+void *DirXfsSetUp(const MunitParameter params[], void *user_data);
+
+/* Recursively remove a temporary directory. */
+void DirTearDown(void *data);
+
+/* Write the given @buf to the given @filename in the given @dir. */
+void DirWriteFile(const char *dir,
+                  const char *filename,
+                  const void *buf,
+                  const size_t n);
+
+/* Write the given @filename and fill it with @n bytes of zeros. */
+void DirWriteFileWithZeros(const char *dir,
+                           const char *filename,
+                           const size_t n);
+
+/* Overwrite @n bytes of the given file with the given @buf data.
+ *
+ * If @whence is zero, overwrite the first @n bytes of the file. If @whence is
+ * positive overwrite the @n bytes starting at offset @whence. If @whence is
+ * negative overwrite @n bytes starting at @whence bytes from the end of the
+ * file. */
+void DirOverwriteFile(const char *dir,
+                      const char *filename,
+                      const void *buf,
+                      const size_t n,
+                      const off_t whence);
+
+/* Truncate the given file, leaving only the first @n bytes. */
+void DirTruncateFile(const char *dir, const char *filename, const size_t n);
+
+/* Grow the given file to the given size, filling the new bytes with zeros. */
+void DirGrowFile(const char *dir, const char *filename, const size_t n);
+
+/* Rename a file in the given directory from filename1 to filename2. */
+void DirRenameFile(const char *dir,
+                   const char *filename1,
+                   const char *filename2);
+
+/* Remove a file. */
+void DirRemoveFile(const char *dir, const char *filename);
+
+/* Read into @buf the content of the given @filename in the given @dir. */
+void DirReadFile(const char *dir,
+                 const char *filename,
+                 void *buf,
+                 const size_t n);
+
+/* Make the given directory not executable, so that files inside it can't be
+ * opened. */
+void DirMakeUnexecutable(const char *dir);
+
+/* Make the given directory not writable. */
+void DirMakeUnwritable(const char *dir);
+
+/* Make the given file not readable. */
+void DirMakeFileUnreadable(const char *dir, const char *filename);
+
+/* Check if the given directory has the given file. */
+bool DirHasFile(const char *dir, const char *filename);
+
+/* Fill the underlying file system of the given dir, leaving only n bytes
+ * free. */
+void DirFill(const char *dir, const size_t n);
+
+#endif /* TEST_DIR_H */
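To make the intended use of these helpers concrete, here is a hedged sketch (not part of this patch) of a test that provisions a file, corrupts it, and checks the outcome. The fixture type and the file name are made up for illustration; only the Dir* calls come from the header above:

    /* Hypothetical test exercising the directory helpers (sketch only).
     * Assumes a fixture declared with FIXTURE_DIR and set up via SET_UP_DIR. */
    static MunitResult testTruncatedFile(const MunitParameter params[],
                                         void *data)
    {
        struct fixture *f = data;
        char buf[64] = {0};
        (void)params;

        /* Create a 64-byte file, then cut it in half. */
        DirWriteFile(f->dir, "open-segment", buf, sizeof buf);
        DirTruncateFile(f->dir, "open-segment", sizeof buf / 2);
        munit_assert_true(DirHasFile(f->dir, "open-segment"));

        /* Remove it and verify it is gone. */
        DirRemoveFile(f->dir, "open-segment");
        munit_assert_false(DirHasFile(f->dir, "open-segment"));

        return MUNIT_OK;
    }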
diff --git a/test/raft/lib/fault.c b/test/raft/lib/fault.c
new file mode 100644
index 000000000..197c3adc1
--- /dev/null
+++ b/test/raft/lib/fault.c
@@ -0,0 +1,69 @@
+#include "fault.h"
+
+#include "munit.h"
+
+void FaultInit(struct Fault *f)
+{
+    f->countdown = -1;
+    f->n = -1;
+    f->paused = false;
+}
+
+bool FaultTick(struct Fault *f)
+{
+    if (MUNIT_UNLIKELY(f->paused)) {
+        return false;
+    }
+
+    /* If the initial delay parameter was set to -1, then never fail. This is
+     * the most common case. */
+    if (MUNIT_LIKELY(f->countdown < 0)) {
+        return false;
+    }
+
+    /* If we did not yet reach 'delay' ticks, then just decrease the countdown.
+     */
+    if (f->countdown > 0) {
+        f->countdown--;
+        return false;
+    }
+
+    munit_assert_int(f->countdown, ==, 0);
+
+    /* We reached 'delay' ticks, let's see how many times we have to trigger
+     * the fault, if any. */
+
+    if (f->n < 0) {
+        /* Trigger the fault forever. */
+        return true;
+    }
+
+    if (f->n > 0) {
+        /* Trigger the fault at least this time. */
+        f->n--;
+        return true;
+    }
+
+    munit_assert_int(f->n, ==, 0);
+
+    /* We reached 'repeat' ticks, let's stop triggering the fault. */
+    f->countdown--;
+
+    return false;
+}
+
+void FaultConfig(struct Fault *f, int delay, int repeat)
+{
+    f->countdown = delay;
+    f->n = repeat;
+}
+
+void FaultPause(struct Fault *f)
+{
+    f->paused = true;
+}
+
+void FaultResume(struct Fault *f)
+{
+    f->paused = false;
+}
diff --git a/test/raft/lib/fault.h b/test/raft/lib/fault.h
new file mode 100644
index 000000000..056469391
--- /dev/null
+++ b/test/raft/lib/fault.h
@@ -0,0 +1,32 @@
+/* Helper for test components supporting fault injection. */
+
+#ifndef TEST_FAULT_H
+#define TEST_FAULT_H
+
+#include <stdbool.h>
+
+/* Information about a fault that should occur in a component. */
+struct Fault
+{
+    int countdown; /* Trigger the fault when this counter gets to zero. */
+    int n;         /* Repeat the fault this many times. Default is -1. */
+    bool paused;   /* Pause fault triggering. */
+};
+
+/* Initialize a fault. */
+void FaultInit(struct Fault *f);
+
+/* Advance the counters of the fault. Return true if the fault should be
+ * triggered, false otherwise. */
+bool FaultTick(struct Fault *f);
+
+/* Configure the fault with the given values. */
+void FaultConfig(struct Fault *f, int delay, int repeat);
+
+/* Pause triggering configured faults. */
+void FaultPause(struct Fault *f);
+
+/* Resume triggering configured faults. */
+void FaultResume(struct Fault *f);
+
+#endif /* TEST_FAULT_H */
diff --git a/test/raft/lib/fs.sh b/test/raft/lib/fs.sh
new file mode 100755
index 000000000..638eeda91
--- /dev/null
+++ b/test/raft/lib/fs.sh
@@ -0,0 +1,118 @@
+#!/bin/sh -e

+# Set up loopback disk devices to test the raft I/O implementation against
+# various file systems.
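+#
+# Example usage (illustrative only; the RAFT_TMP_* variables printed by the
+# "detect" command are the ones consumed by test/raft/lib/dir.c):
+#
+#   ./test/raft/lib/fs.sh setup            # create and mount the file systems
+#   export $(./test/raft/lib/fs.sh detect) # point RAFT_TMP_* at the mounts
+#   make check                             # run the test suite against them
+#   ./test/raft/lib/fs.sh teardown         # unmount and remove everything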
+ +usage() { + echo "usage: $0 setup|teardown [types]" +} + +if [ "${#}" -lt 1 ]; then + usage + exit 1 +fi + +cmd="${1}" +shift + +types="tmpfs" + +# Check if loop devices are available, we might be running inside an +# unprivileged container +if sudo losetup -f > /dev/null 2>&1; then + types="$types ext4" + + if [ "$(which mkfs.btrfs)" != "" ]; then + types="$types btrfs" + fi + + if [ "$(which mkfs.xfs)" != "" ]; then + types="$types xfs" + fi + + if [ "$(which zfs)" != "" ]; then + types="$types zfs" + fi + + if [ "${#}" -gt 0 ]; then + types="${@}" + fi + +fi + +if [ "${cmd}" = "detect" ]; then + vars="" + for type in $types; do + vars="${vars}RAFT_TMP_$(echo ${type} | tr [a-z] [A-Z])=./tmp/${type} " + done + echo $vars + exit 0 +fi + +if [ "${cmd}" = "setup" ]; then + mkdir ./tmp + + for type in $types; do + echo -n "Creating $type loop device mount..." + + # Create the fs mount point + mkdir "./tmp/${type}" + + if [ "$type" = "tmpfs" ]; then + # For tmpfs we don't need a loopback disk device. + sudo mount -t tmpfs -o size=32m tmpfs ./tmp/tmpfs + else + # Create a loopback disk device + dd if=/dev/zero of="./tmp/.${type}" bs=4096 count=28672 > /dev/null 2>&1 + loop=$(sudo losetup -f) + sudo losetup "${loop}" "./tmp/.${type}" + + # Initialize the file system + if [ "$type" = "zfs" ]; then + sudo zpool create raft "${loop}" + sudo zfs create -o mountpoint=$(pwd)/tmp/zfs raft/zfs + else + sudo mkfs.${type} "${loop}" > /dev/null 2>&1 + sudo mount "${loop}" "./tmp/${type}" + fi + fi + + sudo chown $USER "./tmp/${type}" + + echo " done" + done + + exit 0 +fi + +if [ "${cmd}" = "teardown" ]; then + + for type in $types; do + echo -n "Deleting $type loop device mount..." + + sudo umount "./tmp/${type}" + rm -rf "./tmp/${type}" + + if [ "$type" != "tmpfs" ]; then + # For zfs we need to destroy the pool + if [ "$type" = "zfs" ]; then + sudo zpool destroy raft + fi + + # For regular file systems, remove the loopback disk device. + loop=$(sudo losetup -a | grep ".${type}" | cut -f 1 -d :) + sudo losetup -d "${loop}" + rm "./tmp/.${type}" + fi + + echo " done" + done + + rmdir ./tmp + + exit 0 +fi + +usage + +exit 1 diff --git a/test/raft/lib/fsm.c b/test/raft/lib/fsm.c new file mode 100644 index 000000000..78b6ff90e --- /dev/null +++ b/test/raft/lib/fsm.c @@ -0,0 +1,293 @@ +#include "fsm.h" + +#include "../../../src/raft/byte.h" +#include "munit.h" + +/* In-memory implementation of the raft_fsm interface. 
*/ +struct fsm +{ + int x; + int y; + int lock; + void *data; +}; + +/* Command codes */ +enum { SET_X = 1, SET_Y, ADD_X, ADD_Y }; + +static int fsmApply(struct raft_fsm *fsm, + const struct raft_buffer *buf, + void **result) +{ + struct fsm *f = fsm->data; + const void *cursor = buf->base; + unsigned command; + int value; + + if (buf->len != 16) { + return -1; + } + + command = (unsigned)byteGet64(&cursor); + value = (int)byteGet64(&cursor); + + switch (command) { + case SET_X: + f->x = value; + break; + case SET_Y: + f->y = value; + break; + case ADD_X: + f->x += value; + break; + case ADD_Y: + f->y += value; + break; + default: + return -1; + } + + *result = NULL; + + return 0; +} + +static int fsmRestore(struct raft_fsm *fsm, struct raft_buffer *buf) +{ + struct fsm *f = fsm->data; + const void *cursor = buf->base; + + munit_assert_int(buf->len, ==, sizeof(uint64_t) * 2); + + f->x = byteGet64(&cursor); + f->y = byteGet64(&cursor); + + raft_free(buf->base); + + return 0; +} + +static int fsmEncodeSnapshot(int x, + int y, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + struct raft_buffer *buf; + void *cursor; + + *n_bufs = 1; + + *bufs = raft_malloc(sizeof **bufs); + if (*bufs == NULL) { + return RAFT_NOMEM; + } + + buf = &(*bufs)[0]; + buf->len = sizeof(uint64_t) * 2; + buf->base = raft_malloc(buf->len); + if (buf->base == NULL) { + return RAFT_NOMEM; + } + + cursor = (*bufs)[0].base; + + bytePut64(&cursor, x); + bytePut64(&cursor, y); + + return 0; +} + +/* For use with fsm->version 1 */ +static int fsmSnapshot_v1(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + struct fsm *f = fsm->data; + return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs); +} + +/* For use with fsmSnapshotFinalize and fsm->version >= 2 */ +static int fsmSnapshot_v2(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + struct fsm *f = fsm->data; + munit_assert_int(f->lock, ==, 0); + f->lock = 1; + f->data = raft_malloc(8); /* Detect proper cleanup in finalize */ + munit_assert_ptr_not_null(f->data); + return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs); +} + +static int fsmSnapshotInitialize(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + (void)bufs; + (void)n_bufs; + struct fsm *f = fsm->data; + munit_assert_int(f->lock, ==, 0); + f->lock = 1; + munit_assert_ptr_null(f->data); + f->data = raft_malloc(8); /* Detect proper cleanup in finalize */ + munit_assert_ptr_not_null(f->data); + return 0; +} + +static int fsmSnapshotAsync(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + struct fsm *f = fsm->data; + return fsmEncodeSnapshot(f->x, f->y, bufs, n_bufs); +} + +static int fsmSnapshotFinalize(struct raft_fsm *fsm, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + (void)bufs; + (void)n_bufs; + struct fsm *f = fsm->data; + if (*bufs != NULL) { + for (unsigned i = 0; i < *n_bufs; ++i) { + raft_free((*bufs)[i].base); + } + raft_free(*bufs); + } + *bufs = NULL; + *n_bufs = 0; + munit_assert_int(f->lock, ==, 1); + f->lock = 0; + munit_assert_ptr_not_null(f->data); + raft_free(f->data); + f->data = NULL; + return 0; +} + +void FsmInit(struct raft_fsm *fsm, int version) +{ + struct fsm *f = munit_malloc(sizeof *fsm); + memset(fsm, 'x', sizeof(*fsm)); /* Fill with garbage */ + + f->x = 0; + f->y = 0; + f->lock = 0; + f->data = NULL; + + fsm->version = version; + fsm->data = f; + fsm->apply = fsmApply; + fsm->snapshot = fsmSnapshot_v1; + fsm->restore = fsmRestore; + if (version > 1) { + fsm->snapshot = 
fsmSnapshot_v2; + fsm->snapshot_finalize = fsmSnapshotFinalize; + fsm->snapshot_async = NULL; + } +} + +void FsmInitAsync(struct raft_fsm *fsm, int version) +{ + munit_assert_int(version, >, 2); + struct fsm *f = munit_malloc(sizeof *fsm); + memset(fsm, 'x', sizeof(*fsm)); /* Fill with garbage */ + + f->x = 0; + f->y = 0; + f->lock = 0; + f->data = NULL; + + fsm->version = version; + fsm->data = f; + fsm->apply = fsmApply; + fsm->snapshot = fsmSnapshotInitialize; + fsm->snapshot_async = fsmSnapshotAsync; + fsm->snapshot_finalize = fsmSnapshotFinalize; + fsm->restore = fsmRestore; +} + +void FsmClose(struct raft_fsm *fsm) +{ + struct fsm *f = fsm->data; + free(f); +} + +void FsmEncodeSetX(const int value, struct raft_buffer *buf) +{ + void *cursor; + + buf->base = raft_malloc(16); + buf->len = 16; + + munit_assert_ptr_not_null(buf->base); + + cursor = buf->base; + bytePut64(&cursor, SET_X); + bytePut64(&cursor, value); +} + +void FsmEncodeAddX(const int value, struct raft_buffer *buf) +{ + void *cursor; + + buf->base = raft_malloc(16); + buf->len = 16; + + munit_assert_ptr_not_null(buf->base); + + cursor = buf->base; + bytePut64(&cursor, ADD_X); + bytePut64(&cursor, value); +} + +void FsmEncodeSetY(const int value, struct raft_buffer *buf) +{ + void *cursor; + + buf->base = raft_malloc(16); + buf->len = 16; + + munit_assert_ptr_not_null(buf->base); + + cursor = buf->base; + bytePut64(&cursor, SET_Y); + bytePut64(&cursor, value); +} + +void FsmEncodeAddY(const int value, struct raft_buffer *buf) +{ + void *cursor; + + buf->base = raft_malloc(16); + buf->len = 16; + + munit_assert_ptr_not_null(buf->base); + + cursor = buf->base; + bytePut64(&cursor, ADD_Y); + bytePut64(&cursor, value); +} + +void FsmEncodeSnapshot(int x, + int y, + struct raft_buffer *bufs[], + unsigned *n_bufs) +{ + int rc; + rc = fsmEncodeSnapshot(x, y, bufs, n_bufs); + munit_assert_int(rc, ==, 0); +} + +int FsmGetX(struct raft_fsm *fsm) +{ + struct fsm *f = fsm->data; + return f->x; +} + +int FsmGetY(struct raft_fsm *fsm) +{ + struct fsm *f = fsm->data; + return f->y; +} diff --git a/test/raft/lib/fsm.h b/test/raft/lib/fsm.h new file mode 100644 index 000000000..da82fa0f8 --- /dev/null +++ b/test/raft/lib/fsm.h @@ -0,0 +1,39 @@ +/* Test implementation of the raft_fsm interface, with fault injection. + * + * The test FSM supports only two commands: setting x and setting y. */ + +#ifndef TEST_FSM_H +#define TEST_FSM_H + +#include "../../../src/raft.h" + +void FsmInit(struct raft_fsm *fsm, int version); + +/* Same as FsmInit but with asynchronous snapshots */ +void FsmInitAsync(struct raft_fsm *fsm, int version); + +void FsmClose(struct raft_fsm *fsm); + +/* Encode a command to set x to the given value. */ +void FsmEncodeSetX(int value, struct raft_buffer *buf); + +/* Encode a command to add the given value to x. */ +void FsmEncodeAddX(int value, struct raft_buffer *buf); + +/* Encode a command to set y to the given value. */ +void FsmEncodeSetY(int value, struct raft_buffer *buf); + +/* Encode a command to add the given value to y. */ +void FsmEncodeAddY(int value, struct raft_buffer *buf); + +/* Encode a snapshot of an FSM with the given values for x and y. */ +void FsmEncodeSnapshot(int x, + int y, + struct raft_buffer *bufs[], + unsigned *n_bufs); + +/* Return the current value of x or y. 
*/ +int FsmGetX(struct raft_fsm *fsm); +int FsmGetY(struct raft_fsm *fsm); + +#endif /* TEST_FSM_H */ diff --git a/test/raft/lib/heap.c b/test/raft/lib/heap.c new file mode 100644 index 000000000..77f187ea3 --- /dev/null +++ b/test/raft/lib/heap.c @@ -0,0 +1,134 @@ +#include "heap.h" + +#include + +#include "fault.h" +#include "munit.h" + +struct heap +{ + size_t alignment; /* Value of last aligned alloc */ + struct Fault fault; /* Fault trigger. */ +}; + +static void heapInit(struct heap *h) +{ + h->alignment = 0; + FaultInit(&h->fault); +} + +static void *heapMalloc(void *data, size_t size) +{ + struct heap *h = data; + if (FaultTick(&h->fault)) { + return NULL; + } + return munit_malloc(size); +} + +static void heapFree(void *data, void *ptr) +{ + (void)data; + free(ptr); +} + +static void *heapCalloc(void *data, size_t nmemb, size_t size) +{ + struct heap *h = data; + if (FaultTick(&h->fault)) { + return NULL; + } + return munit_calloc(nmemb, size); +} + +static void *heapRealloc(void *data, void *ptr, size_t size) +{ + struct heap *h = data; + + if (FaultTick(&h->fault)) { + return NULL; + } + + ptr = realloc(ptr, size); + + if (size == 0) { + munit_assert_ptr_null(ptr); + } else { + munit_assert_ptr_not_null(ptr); + } + + return ptr; +} + +static void *heapAlignedAlloc(void *data, size_t alignment, size_t size) +{ + struct heap *h = data; + void *p; + + if (FaultTick(&h->fault)) { + return NULL; + } + + p = aligned_alloc(alignment, size); + munit_assert_ptr_not_null(p); + + h->alignment = alignment; + + return p; +} + +static void heapAlignedFree(void *data, size_t alignment, void *ptr) +{ + struct heap *h = data; + munit_assert_ulong(alignment, ==, h->alignment); + heapFree(data, ptr); +} + +static int getIntParam(const MunitParameter params[], const char *name) +{ + const char *value = munit_parameters_get(params, name); + return value != NULL ? atoi(value) : 0; +} + +void HeapSetUp(const MunitParameter params[], struct raft_heap *h) +{ + struct heap *heap = munit_malloc(sizeof *heap); + int delay = getIntParam(params, TEST_HEAP_FAULT_DELAY); + int repeat = getIntParam(params, TEST_HEAP_FAULT_REPEAT); + + munit_assert_ptr_not_null(h); + + heapInit(heap); + + FaultConfig(&heap->fault, delay, repeat); + + h->data = heap; + h->malloc = heapMalloc; + h->free = heapFree; + h->calloc = heapCalloc; + h->realloc = heapRealloc; + h->aligned_alloc = heapAlignedAlloc; + h->aligned_free = heapAlignedFree; + + raft_heap_set(h); + FaultPause(&heap->fault); +} + +void HeapTearDown(struct raft_heap *h) +{ + struct heap *heap = h->data; + free(heap); + raft_heap_set_default(); +} + +void HeapFaultConfig(struct raft_heap *h, int delay, int repeat) +{ + struct heap *heap = h->data; + FaultConfig(&heap->fault, delay, repeat); +} + +void HeapFaultEnable(struct raft_heap *h) +{ + struct heap *heap = h->data; + FaultResume(&heap->fault); +} diff --git a/test/raft/lib/heap.h b/test/raft/lib/heap.h new file mode 100644 index 000000000..33f79f1e2 --- /dev/null +++ b/test/raft/lib/heap.h @@ -0,0 +1,33 @@ +/* Add support for fault injection and leak detection to stdlib's malloc() + * family. */ + +#ifndef TEST_HEAP_H +#define TEST_HEAP_H + +#include "../../../src/raft.h" +#include "munit.h" + +/* Munit parameter defining after how many API calls the test raft_heap + * implementation should start failing and return errors. The default is -1, + * meaning that no failure will ever occur. 
*/
+#define TEST_HEAP_FAULT_DELAY "heap-fault-delay"
+
+/* Munit parameter defining how many consecutive times API calls against the
+ * test raft_heap implementation should keep failing after they started
+ * failing. This parameter has an effect only if 'heap-fault-delay' is 0 or
+ * greater. The default is 1, and -1 means "keep failing forever". */
+#define TEST_HEAP_FAULT_REPEAT "heap-fault-repeat"
+
+/* Macro helpers. */
+#define FIXTURE_HEAP struct raft_heap heap
+#define SET_UP_HEAP HeapSetUp(params, &f->heap)
+#define TEAR_DOWN_HEAP HeapTearDown(&f->heap)
+#define HEAP_FAULT_ENABLE HeapFaultEnable(&f->heap)
+
+void HeapSetUp(const MunitParameter params[], struct raft_heap *h);
+void HeapTearDown(struct raft_heap *h);
+
+void HeapFaultConfig(struct raft_heap *h, int delay, int repeat);
+void HeapFaultEnable(struct raft_heap *h);
+
+#endif /* TEST_HEAP_H */
diff --git a/test/raft/lib/loop.c b/test/raft/lib/loop.c
new file mode 100644
index 000000000..6a63161a4
--- /dev/null
+++ b/test/raft/lib/loop.c
@@ -0,0 +1,7 @@
+#include "loop.h"
+
+void test_loop_walk_cb(uv_handle_t *handle, void *arg)
+{
+    (void)arg;
+    munit_logf(MUNIT_LOG_INFO, "handle %d", handle->type);
+}
diff --git a/test/raft/lib/loop.h b/test/raft/lib/loop.h
new file mode 100644
index 000000000..03d3832fd
--- /dev/null
+++ b/test/raft/lib/loop.h
@@ -0,0 +1,115 @@
+/* Add support for using the libuv loop in tests. */
+
+#ifndef TEST_LOOP_H
+#define TEST_LOOP_H
+
+#include <uv.h>
+
+#include "../../../src/raft.h"
+#include "munit.h"
+
+/* Max n. of loop iterations run by a single function call */
+#define LOOP_MAX_RUN 20
+
+#define FIXTURE_LOOP struct uv_loop_s loop
+
+/* Older libuv versions might try to free() memory that was not allocated. */
+#if HAVE_DECL_UV_FS_O_CREAT
+#define LOOP_REPLACE_ALLOCATOR                                         \
+    _rv = uv_replace_allocator(raft_malloc, raft_realloc, raft_calloc, \
+                               raft_free);                             \
+    munit_assert_int(_rv, ==, 0)
+#else
+#define LOOP_REPLACE_ALLOCATOR
+#endif
+
+#define SETUP_LOOP                        \
+    {                                     \
+        int _rv;                          \
+        LOOP_REPLACE_ALLOCATOR;           \
+        _rv = uv_loop_init(&f->loop);     \
+        munit_assert_int(_rv, ==, 0);     \
+    }
+
+#define TEAR_DOWN_LOOP                                                     \
+    {                                                                      \
+        int rv_;                                                           \
+        int alive_ = uv_loop_alive(&f->loop);                              \
+        if (alive_ != 0) {                                                 \
+            LOOP_STOP;                                                     \
+        }                                                                  \
+        rv_ = uv_loop_close(&f->loop);                                     \
+        if (rv_ != 0) {                                                    \
+            uv_walk(&f->loop, test_loop_walk_cb, NULL);                    \
+            munit_errorf("uv_loop_close: %s (%d)", uv_strerror(rv_), rv_); \
+        }                                                                  \
+        rv_ = uv_replace_allocator(malloc, realloc, calloc, free);         \
+        munit_assert_int(rv_, ==, 0);                                      \
+    }
+
+/* Run the loop until there are no pending active handles or the given number
+ * of iterations is reached. */
+#define LOOP_RUN(N)                                                       \
+    {                                                                     \
+        unsigned i__;                                                     \
+        int rv__;                                                         \
+        for (i__ = 0; i__ < N; i__++) {                                   \
+            rv__ = uv_run(&f->loop, UV_RUN_ONCE);                         \
+            if (rv__ < 0) {                                               \
+                munit_errorf("uv_run: %s (%d)", uv_strerror(rv__), rv__); \
+            }                                                             \
+            if (rv__ == 0) {                                              \
+                break;                                                    \
+            }                                                             \
+        }                                                                 \
+    }
+
+/* Run the loop until the value stored through the given boolean pointer is
+ * true.
+ *
+ * If the loop exhausts all active handles or if #LOOP_MAX_RUN is reached, the
+ * test fails.
*/ +#define LOOP_RUN_UNTIL(CONDITION) \ + { \ + unsigned __i; \ + int __rv; \ + for (__i = 0; __i < LOOP_MAX_RUN; __i++) { \ + if (*(CONDITION)) { \ + break; \ + } \ + __rv = uv_run(&f->loop, UV_RUN_ONCE); \ + if (__rv < 0) { \ + munit_errorf("uv_run: %s (%d)", uv_strerror(__rv), __rv); \ + } \ + if (__rv == 0) { \ + if (*(CONDITION)) { \ + break; \ + } \ + munit_errorf("uv_run: stopped after %u iterations", __i + 1); \ + } \ + } \ + if (!*(CONDITION)) { \ + munit_errorf("uv_run: condition not met in %d iterations", \ + LOOP_MAX_RUN); \ + } \ + } + +/* Run the loop until there are no pending active handles. + * + * If there are still pending active handles after LOOP_MAX_RUN iterations, the + * test will fail. + * + * This is meant to be used in tear down functions. */ +#define LOOP_STOP \ + { \ + int alive__; \ + LOOP_RUN(LOOP_MAX_RUN); \ + alive__ = uv_loop_alive(&f->loop); \ + if (alive__ != 0) { \ + munit_error("loop has still pending active handles"); \ + } \ + } + +void test_loop_walk_cb(uv_handle_t *handle, void *arg); + +#endif /* TEST_LOOP_H */ diff --git a/test/raft/lib/macros.h b/test/raft/lib/macros.h new file mode 100644 index 000000000..9af9bd024 --- /dev/null +++ b/test/raft/lib/macros.h @@ -0,0 +1,13 @@ +/** + * Miscellaneous test macros. + */ + +#ifndef TEST_MACROS_H_ +#define TEST_MACROS_H_ + +#define GET_2ND_ARG(arg1, arg2, ...) arg2 +#define GET_3RD_ARG(arg1, arg2, arg3, ...) arg3 +#define GET_4TH_ARG(arg1, arg2, arg3, arg4, ...) arg4 +#define GET_5TH_ARG(arg1, arg2, arg3, arg4, arg5, ...) arg5 + +#endif /* TEST_MACROS_H_ */ diff --git a/test/raft/lib/munit.c b/test/raft/lib/munit.c new file mode 100644 index 000000000..1d496f4b5 --- /dev/null +++ b/test/raft/lib/munit.c @@ -0,0 +1,2077 @@ +/* Copyright (c) 2013-2018 Evan Nemerson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*** Configuration ***/ + +/* This is just where the output from the test goes. It's really just + * meant to let you choose stdout or stderr, but if anyone really want + * to direct it to a file let me know, it would be fairly easy to + * support. */ +#if !defined(MUNIT_OUTPUT_FILE) +# define MUNIT_OUTPUT_FILE stdout +#endif + +/* This is a bit more useful; it tells µnit how to format the seconds in + * timed tests. If your tests run for longer you might want to reduce + * it, and if your computer is really fast and your tests are tiny you + * can increase it. 
*/ +#if !defined(MUNIT_TEST_TIME_FORMAT) +# define MUNIT_TEST_TIME_FORMAT "0.8f" +#endif + +/* If you have long test names you might want to consider bumping + * this. The result information takes 43 characters. */ +#if !defined(MUNIT_TEST_NAME_LEN) +# define MUNIT_TEST_NAME_LEN 37 +#endif + +/* If you don't like the timing information, you can disable it by + * defining MUNIT_DISABLE_TIMING. */ +#if !defined(MUNIT_DISABLE_TIMING) +# define MUNIT_ENABLE_TIMING +#endif + +/*** End configuration ***/ + +#if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L) +# undef _POSIX_C_SOURCE +#endif +#if !defined(_POSIX_C_SOURCE) +# define _POSIX_C_SOURCE 200809L +#endif + +/* Solaris freaks out if you try to use a POSIX or SUS standard without + * the "right" C standard. */ +#if defined(_XOPEN_SOURCE) +# undef _XOPEN_SOURCE +#endif + +#if defined(__STDC_VERSION__) +# if __STDC_VERSION__ >= 201112L +# define _XOPEN_SOURCE 700 +# elif __STDC_VERSION__ >= 199901L +# define _XOPEN_SOURCE 600 +# endif +#endif + +/* Because, according to Microsoft, POSIX is deprecated. You've got + * to appreciate the chutzpah. */ +#if defined(_MSC_VER) && !defined(_CRT_NONSTDC_NO_DEPRECATE) +# define _CRT_NONSTDC_NO_DEPRECATE +#endif + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) +# include +#elif defined(_WIN32) +/* https://msdn.microsoft.com/en-us/library/tf4dy80a.aspx */ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(MUNIT_NO_NL_LANGINFO) && !defined(_WIN32) +#define MUNIT_NL_LANGINFO +#include +#include +#include +#endif + +#if !defined(_WIN32) +# include +# include +# include +#else +# include +# include +# include +# if !defined(STDERR_FILENO) +# define STDERR_FILENO _fileno(stderr) +# endif +#endif + +#include "munit.h" + +#define MUNIT_STRINGIFY(x) #x +#define MUNIT_XSTRINGIFY(x) MUNIT_STRINGIFY(x) + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_CC) || defined(__IBMCPP__) +# define MUNIT_THREAD_LOCAL __thread +#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201102L)) || defined(_Thread_local) +# define MUNIT_THREAD_LOCAL _Thread_local +#elif defined(_WIN32) +# define MUNIT_THREAD_LOCAL __declspec(thread) +#endif + +/* MSVC 12.0 will emit a warning at /W4 for code like 'do { ... } + * while (0)', or 'do { ... } while (true)'. I'm pretty sure nobody + * at Microsoft compiles with /W4. */ +#if defined(_MSC_VER) && (_MSC_VER <= 1800) +#pragma warning(disable: 4127) +#endif + +#if defined(_WIN32) || defined(__EMSCRIPTEN__) +# define MUNIT_NO_FORK +#endif + +#if defined(__EMSCRIPTEN__) +# define MUNIT_NO_BUFFER +#endif + +/*** Logging ***/ + +static MunitLogLevel munit_log_level_visible = MUNIT_LOG_INFO; +static MunitLogLevel munit_log_level_fatal = MUNIT_LOG_ERROR; + +#if defined(MUNIT_THREAD_LOCAL) +static MUNIT_THREAD_LOCAL bool munit_error_jmp_buf_valid = false; +static MUNIT_THREAD_LOCAL jmp_buf munit_error_jmp_buf; +#endif + +#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) +static MUNIT_THREAD_LOCAL bool munit_tear_down_jmp_buf_valid = false; +static MUNIT_THREAD_LOCAL jmp_buf munit_tear_down_jmp_buf; +#endif + +/* At certain warning levels, mingw will trigger warnings about + * suggesting the format attribute, which we've explicitly *not* set + * because it will then choke on our attempts to use the MS-specific + * I64 modifier for size_t (which we have to use since MSVC doesn't + * support the C99 z modifier). 
*/ + +#if defined(__MINGW32__) || defined(__MINGW64__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +#endif + +MUNIT_PRINTF(5,0) +static void +munit_logf_exv(MunitLogLevel level, FILE* fp, const char* filename, int line, const char* format, va_list ap) { + if (level < munit_log_level_visible) + return; + + switch (level) { + case MUNIT_LOG_DEBUG: + fputs("Debug", fp); + break; + case MUNIT_LOG_INFO: + fputs("Info", fp); + break; + case MUNIT_LOG_WARNING: + fputs("Warning", fp); + break; + case MUNIT_LOG_ERROR: + fputs("Error", fp); + break; + default: + munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Invalid log level (%d)", level); + return; + } + + fputs(": ", fp); + if (filename != NULL) + fprintf(fp, "%s:%d: ", filename, line); + vfprintf(fp, format, ap); + fputc('\n', fp); +} + +MUNIT_PRINTF(3,4) +static void +munit_logf_internal(MunitLogLevel level, FILE* fp, const char* format, ...) { + va_list ap; + + va_start(ap, format); + munit_logf_exv(level, fp, NULL, 0, format, ap); + va_end(ap); +} + +static void +munit_log_internal(MunitLogLevel level, FILE* fp, const char* message) { + munit_logf_internal(level, fp, "%s", message); +} + +void +munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...) { + va_list ap; + + va_start(ap, format); + munit_logf_exv(level, stderr, filename, line, format, ap); + va_end(ap); + + if (level >= munit_log_level_fatal) { +#if defined(MUNIT_THREAD_LOCAL) + if (munit_error_jmp_buf_valid) + longjmp(munit_error_jmp_buf, 1); +#endif + abort(); + } +} + +void +munit_errorf_ex(const char* filename, int line, const char* format, ...) { + va_list ap; + + va_start(ap, format); + munit_logf_exv(MUNIT_LOG_ERROR, stderr, filename, line, format, ap); + va_end(ap); + +#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) + if (munit_tear_down_jmp_buf_valid) + longjmp(munit_tear_down_jmp_buf, 1); +#endif + +#if defined(MUNIT_THREAD_LOCAL) + if (munit_error_jmp_buf_valid) + longjmp(munit_error_jmp_buf, 1); +#endif + abort(); +} + +#if defined(__MINGW32__) || defined(__MINGW64__) +#pragma GCC diagnostic pop +#endif + +#if !defined(MUNIT_STRERROR_LEN) +# define MUNIT_STRERROR_LEN 80 +#endif + +static void +munit_log_errno(MunitLogLevel level, FILE* fp, const char* msg) { +#if defined(MUNIT_NO_STRERROR_R) || (defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API)) + munit_logf_internal(level, fp, "%s: %s (%d)", msg, strerror(errno), errno); +#else + char munit_error_str[MUNIT_STRERROR_LEN]; + munit_error_str[0] = '\0'; + +#if !defined(_WIN32) + strerror_r(errno, munit_error_str, MUNIT_STRERROR_LEN); +#else + strerror_s(munit_error_str, MUNIT_STRERROR_LEN, errno); +#endif + + munit_logf_internal(level, fp, "%s: %s (%d)", msg, munit_error_str, errno); +#endif +} + +/*** Memory allocation ***/ + +void* +munit_malloc_ex(const char* filename, int line, size_t size) { + void* ptr; + + if (size == 0) + return NULL; + + ptr = calloc(1, size); + if (MUNIT_UNLIKELY(ptr == NULL)) { + munit_logf_ex(MUNIT_LOG_ERROR, filename, line, "Failed to allocate %" MUNIT_SIZE_MODIFIER "u bytes.", size); + } + + return ptr; +} + +/*** Timer code ***/ + +#if defined(MUNIT_ENABLE_TIMING) + +#define psnip_uint64_t munit_uint64_t +#define psnip_uint32_t munit_uint32_t + +/* Code copied from portable-snippets + * . If you need to + * change something, please do it there so we can keep the code in + * sync. 
*/ + +/* Clocks (v1) + * Portable Snippets - https://github.com/nemequ/portable-snippets + * Created by Evan Nemerson + * + * To the extent possible under law, the authors have waived all + * copyright and related or neighboring rights to this code. For + * details, see the Creative Commons Zero 1.0 Universal license at + * https://creativecommons.org/publicdomain/zero/1.0/ + */ + +#if !defined(PSNIP_CLOCK_H) +#define PSNIP_CLOCK_H + +#if !defined(psnip_uint64_t) +# include "../exact-int/exact-int.h" +#endif + +#if !defined(PSNIP_CLOCK_STATIC_INLINE) +# if defined(__GNUC__) +# define PSNIP_CLOCK__COMPILER_ATTRIBUTES __attribute__((__unused__)) +# else +# define PSNIP_CLOCK__COMPILER_ATTRIBUTES +# endif + +# define PSNIP_CLOCK__FUNCTION PSNIP_CLOCK__COMPILER_ATTRIBUTES static +#endif + +enum PsnipClockType { + /* This clock provides the current time, in units since 1970-01-01 + * 00:00:00 UTC not including leap seconds. In other words, UNIX + * time. Keep in mind that this clock doesn't account for leap + * seconds, and can go backwards (think NTP adjustments). */ + PSNIP_CLOCK_TYPE_WALL = 1, + /* The CPU time is a clock which increases only when the current + * process is active (i.e., it doesn't increment while blocking on + * I/O). */ + PSNIP_CLOCK_TYPE_CPU = 2, + /* Monotonic time is always running (unlike CPU time), but it only + ever moves forward unless you reboot the system. Things like NTP + adjustments have no effect on this clock. */ + PSNIP_CLOCK_TYPE_MONOTONIC = 3 +}; + +struct PsnipClockTimespec { + psnip_uint64_t seconds; + psnip_uint64_t nanoseconds; +}; + +/* Methods we support: */ + +#define PSNIP_CLOCK_METHOD_CLOCK_GETTIME 1 +#define PSNIP_CLOCK_METHOD_TIME 2 +#define PSNIP_CLOCK_METHOD_GETTIMEOFDAY 3 +#define PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER 4 +#define PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME 5 +#define PSNIP_CLOCK_METHOD_CLOCK 6 +#define PSNIP_CLOCK_METHOD_GETPROCESSTIMES 7 +#define PSNIP_CLOCK_METHOD_GETRUSAGE 8 +#define PSNIP_CLOCK_METHOD_GETSYSTEMTIMEPRECISEASFILETIME 9 +#define PSNIP_CLOCK_METHOD_GETTICKCOUNT64 10 + +#include + +#if defined(HEDLEY_UNREACHABLE) +# define PSNIP_CLOCK_UNREACHABLE() HEDLEY_UNREACHABLE() +#else +# define PSNIP_CLOCK_UNREACHABLE() assert(0) +#endif + +/* Choose an implementation */ + +/* #undef PSNIP_CLOCK_WALL_METHOD */ +/* #undef PSNIP_CLOCK_CPU_METHOD */ +/* #undef PSNIP_CLOCK_MONOTONIC_METHOD */ + +/* We want to be able to detect the libc implementation, so we include + ( isn't available everywhere). */ + +#if defined(__unix__) || defined(__unix) || defined(__linux__) +# include +# include +#endif + +#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) +/* These are known to work without librt. If you know of others + * please let us know so we can add them. 
*/ +# if \ + (defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 17))) || \ + (defined(__FreeBSD__)) +# define PSNIP_CLOCK_HAVE_CLOCK_GETTIME +# elif !defined(PSNIP_CLOCK_NO_LIBRT) +# define PSNIP_CLOCK_HAVE_CLOCK_GETTIME +# endif +#endif + +#if defined(_WIN32) +# if !defined(PSNIP_CLOCK_CPU_METHOD) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_GETPROCESSTIMES +# endif +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER +# endif +#endif + +#if defined(__MACH__) && !defined(__gnu_hurd__) +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME +# endif +#endif + +#if defined(PSNIP_CLOCK_HAVE_CLOCK_GETTIME) +# include +# if !defined(PSNIP_CLOCK_WALL_METHOD) +# if defined(CLOCK_REALTIME_PRECISE) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME_PRECISE +# elif !defined(__sun) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_WALL CLOCK_REALTIME +# endif +# endif +# if !defined(PSNIP_CLOCK_CPU_METHOD) +# if defined(_POSIX_CPUTIME) || defined(CLOCK_PROCESS_CPUTIME_ID) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_PROCESS_CPUTIME_ID +# elif defined(CLOCK_VIRTUAL) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_CPU CLOCK_VIRTUAL +# endif +# endif +# if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) +# if defined(CLOCK_MONOTONIC_RAW) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC +# elif defined(CLOCK_MONOTONIC_PRECISE) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC_PRECISE +# elif defined(_POSIX_MONOTONIC_CLOCK) || defined(CLOCK_MONOTONIC) +# define PSNIP_CLOCK_MONOTONIC_METHOD PSNIP_CLOCK_METHOD_CLOCK_GETTIME +# define PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC CLOCK_MONOTONIC +# endif +# endif +#endif + +#if defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L) +# if !defined(PSNIP_CLOCK_WALL_METHOD) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_GETTIMEOFDAY +# endif +#endif + +#if !defined(PSNIP_CLOCK_WALL_METHOD) +# define PSNIP_CLOCK_WALL_METHOD PSNIP_CLOCK_METHOD_TIME +#endif + +#if !defined(PSNIP_CLOCK_CPU_METHOD) +# define PSNIP_CLOCK_CPU_METHOD PSNIP_CLOCK_METHOD_CLOCK +#endif + +/* Primarily here for testing. */ +#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD) && defined(PSNIP_CLOCK_REQUIRE_MONOTONIC) +# error No monotonic clock found. 
+#endif + +/* Implementations */ + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_TIME)) +# include +#endif + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY)) +# include +#endif + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES)) || \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64)) +# include +#endif + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE)) +# include +# include +#endif + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME)) +# include +# include +# include +#endif + +/*** Implementations ***/ + +#define PSNIP_CLOCK_NSEC_PER_SEC ((psnip_uint32_t) (1000000000ULL)) + +#if \ + (defined(PSNIP_CLOCK_CPU_METHOD) && (PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_WALL_METHOD) && (PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) || \ + (defined(PSNIP_CLOCK_MONOTONIC_METHOD) && (PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME)) +PSNIP_CLOCK__FUNCTION psnip_uint32_t +psnip_clock__clock_getres (clockid_t clk_id) { + struct timespec res; + int r; + + r = clock_getres(clk_id, &res); + if (r != 0) + return 0; + + return (psnip_uint32_t) (PSNIP_CLOCK_NSEC_PER_SEC / res.tv_nsec); +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock__clock_gettime (clockid_t clk_id, struct 
PsnipClockTimespec* res) { + struct timespec ts; + + if (clock_gettime(clk_id, &ts) != 0) + return -10; + + res->seconds = (psnip_uint64_t) (ts.tv_sec); + res->nanoseconds = (psnip_uint64_t) (ts.tv_nsec); + + return 0; +} +#endif + +PSNIP_CLOCK__FUNCTION psnip_uint32_t +psnip_clock_wall_get_precision (void) { +#if !defined(PSNIP_CLOCK_WALL_METHOD) + return 0; +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_WALL); +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY + return 1000000; +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME + return 1; +#else + return 0; +#endif +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock_wall_get_time (struct PsnipClockTimespec* res) { + (void) res; + +#if !defined(PSNIP_CLOCK_WALL_METHOD) + return -2; +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_WALL, res); +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_TIME + res->seconds = time(NULL); + res->nanoseconds = 0; +#elif defined(PSNIP_CLOCK_WALL_METHOD) && PSNIP_CLOCK_WALL_METHOD == PSNIP_CLOCK_METHOD_GETTIMEOFDAY + struct timeval tv; + + if (gettimeofday(&tv, NULL) != 0) + return -6; + + res->seconds = tv.tv_sec; + res->nanoseconds = tv.tv_usec * 1000; +#else + return -2; +#endif + + return 0; +} + +PSNIP_CLOCK__FUNCTION psnip_uint32_t +psnip_clock_cpu_get_precision (void) { +#if !defined(PSNIP_CLOCK_CPU_METHOD) + return 0; +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_CPU); +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK + return CLOCKS_PER_SEC; +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES + return PSNIP_CLOCK_NSEC_PER_SEC / 100; +#else + return 0; +#endif +} + +PSNIP_CLOCK__FUNCTION int +psnip_clock_cpu_get_time (struct PsnipClockTimespec* res) { +#if !defined(PSNIP_CLOCK_CPU_METHOD) + (void) res; + return -2; +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME + return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_CPU, res); +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_CLOCK + clock_t t = clock(); + if (t == ((clock_t) -1)) + return -5; + res->seconds = t / CLOCKS_PER_SEC; + res->nanoseconds = (t % CLOCKS_PER_SEC) * (PSNIP_CLOCK_NSEC_PER_SEC / CLOCKS_PER_SEC); +#elif defined(PSNIP_CLOCK_CPU_METHOD) && PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETPROCESSTIMES + FILETIME CreationTime, ExitTime, KernelTime, UserTime; + LARGE_INTEGER date, adjust; + + if (!GetProcessTimes(GetCurrentProcess(), &CreationTime, &ExitTime, &KernelTime, &UserTime)) + return -7; + + /* http://www.frenk.com/2009/12/convert-filetime-to-unix-timestamp/ */ + date.HighPart = UserTime.dwHighDateTime; + date.LowPart = UserTime.dwLowDateTime; + adjust.QuadPart = 11644473600000 * 10000; + date.QuadPart -= adjust.QuadPart; + + res->seconds = date.QuadPart / 10000000; + res->nanoseconds = (date.QuadPart % 10000000) * (PSNIP_CLOCK_NSEC_PER_SEC / 100); +#elif PSNIP_CLOCK_CPU_METHOD == PSNIP_CLOCK_METHOD_GETRUSAGE + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage) != 
+    return -8;
+
+  res->seconds = usage.ru_utime.tv_sec;
+  res->nanoseconds = usage.ru_utime.tv_usec * 1000;
+#else
+  (void) res;
+  return -2;
+#endif
+
+  return 0;
+}
+
+PSNIP_CLOCK__FUNCTION psnip_uint32_t
+psnip_clock_monotonic_get_precision (void) {
+#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+  return 0;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_getres(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC);
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
+  static mach_timebase_info_data_t tbi = { 0, };
+  if (tbi.denom == 0)
+    mach_timebase_info(&tbi);
+  return (psnip_uint32_t) (tbi.numer / tbi.denom);
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
+  return 1000;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
+  LARGE_INTEGER Frequency;
+  QueryPerformanceFrequency(&Frequency);
+  return (psnip_uint32_t) ((Frequency.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC) ? PSNIP_CLOCK_NSEC_PER_SEC : Frequency.QuadPart);
+#else
+  return 0;
+#endif
+}
+
+PSNIP_CLOCK__FUNCTION int
+psnip_clock_monotonic_get_time (struct PsnipClockTimespec* res) {
+#if !defined(PSNIP_CLOCK_MONOTONIC_METHOD)
+  (void) res;
+  return -2;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_CLOCK_GETTIME
+  return psnip_clock__clock_gettime(PSNIP_CLOCK_CLOCK_GETTIME_MONOTONIC, res);
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_MACH_ABSOLUTE_TIME
+  psnip_uint64_t nsec = mach_absolute_time();
+  static mach_timebase_info_data_t tbi = { 0, };
+  if (tbi.denom == 0)
+    mach_timebase_info(&tbi);
+  nsec *= ((psnip_uint64_t) tbi.numer) / ((psnip_uint64_t) tbi.denom);
+  res->seconds = nsec / PSNIP_CLOCK_NSEC_PER_SEC;
+  res->nanoseconds = nsec % PSNIP_CLOCK_NSEC_PER_SEC;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_QUERYPERFORMANCECOUNTER
+  LARGE_INTEGER t, f;
+  if (QueryPerformanceCounter(&t) == 0)
+    return -12;
+
+  QueryPerformanceFrequency(&f);
+  res->seconds = t.QuadPart / f.QuadPart;
+  res->nanoseconds = t.QuadPart % f.QuadPart;
+  if (f.QuadPart > PSNIP_CLOCK_NSEC_PER_SEC)
+    res->nanoseconds /= f.QuadPart / PSNIP_CLOCK_NSEC_PER_SEC;
+  else
+    res->nanoseconds *= PSNIP_CLOCK_NSEC_PER_SEC / f.QuadPart;
+#elif defined(PSNIP_CLOCK_MONOTONIC_METHOD) && PSNIP_CLOCK_MONOTONIC_METHOD == PSNIP_CLOCK_METHOD_GETTICKCOUNT64
+  const ULONGLONG msec = GetTickCount64();
+  res->seconds = msec / 1000;
+  /* GetTickCount64() returns milliseconds; convert the sub-second
+   * remainder to nanoseconds. */
+  res->nanoseconds = (msec % 1000) * (PSNIP_CLOCK_NSEC_PER_SEC / 1000);
+#else
+  (void) res;
+  return -2;
+#endif
+
+  return 0;
+}
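+
+/* Usage sketch (illustrative only, not part of the vendored code): reading
+ * a monotonic timestamp with the getters defined above.
+ *
+ *   struct PsnipClockTimespec ts;
+ *   if (psnip_clock_monotonic_get_time(&ts) == 0)
+ *     printf("%llu.%09llu\n",
+ *            (unsigned long long) ts.seconds,
+ *            (unsigned long long) ts.nanoseconds);
+ *   else
+ *     puts("no monotonic clock available on this platform");
+ */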
+
+/* Returns the number of ticks per second for the specified clock.
+ * For example, a clock with millisecond precision would return 1000,
+ * and a clock with 1 second precision (such as the time() function)
+ * would return 1.
+ *
+ * If the requested clock isn't available, it will return 0.
+ * Hopefully this will be rare, but if it happens to you please let us
+ * know so we can work on finding a way to support your system.
+ *
+ * Note that different clocks on the same system often have different
+ * precisions.
+ */
+PSNIP_CLOCK__FUNCTION psnip_uint32_t
+psnip_clock_get_precision (enum PsnipClockType clock_type) {
+  switch (clock_type) {
+    case PSNIP_CLOCK_TYPE_MONOTONIC:
+      return psnip_clock_monotonic_get_precision ();
+    case PSNIP_CLOCK_TYPE_CPU:
+      return psnip_clock_cpu_get_precision ();
+    case PSNIP_CLOCK_TYPE_WALL:
+      return psnip_clock_wall_get_precision ();
+  }
+
+  PSNIP_CLOCK_UNREACHABLE();
+  return 0;
+}
+
+/* Set the provided timespec to the requested time.  Returns 0 on
+ * success, or a negative value on failure. */
+PSNIP_CLOCK__FUNCTION int
+psnip_clock_get_time (enum PsnipClockType clock_type, struct PsnipClockTimespec* res) {
+  assert(res != NULL);
+
+  switch (clock_type) {
+    case PSNIP_CLOCK_TYPE_MONOTONIC:
+      return psnip_clock_monotonic_get_time (res);
+    case PSNIP_CLOCK_TYPE_CPU:
+      return psnip_clock_cpu_get_time (res);
+    case PSNIP_CLOCK_TYPE_WALL:
+      return psnip_clock_wall_get_time (res);
+  }
+
+  return -1;
+}
+
+#endif /* !defined(PSNIP_CLOCK_H) */
+
+static psnip_uint64_t
+munit_clock_get_elapsed(struct PsnipClockTimespec* start, struct PsnipClockTimespec* end) {
+  psnip_uint64_t r = (end->seconds - start->seconds) * PSNIP_CLOCK_NSEC_PER_SEC;
+  if (end->nanoseconds < start->nanoseconds) {
+    r -= (start->nanoseconds - end->nanoseconds);
+  } else {
+    r += (end->nanoseconds - start->nanoseconds);
+  }
+  return r;
+}
+
+#else
+# include <time.h>
+#endif /* defined(MUNIT_ENABLE_TIMING) */
+
+/*** PRNG stuff ***/
+
+/* This is (unless I screwed up, which is entirely possible) the
+ * version of PCG with 32-bit state.  It was chosen because it has a
+ * small enough state that we should reliably be able to use CAS
+ * instead of requiring a lock for thread-safety.
+ *
+ * If I did screw up, I probably will not bother changing it unless
+ * there is a significant bias.  It's really not important that this be
+ * particularly strong; as long as it is fairly random, it's much more
+ * important that it be reproducible, so bug reports have a better
+ * chance of being reproducible. */
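+
+/* Illustrative sketch of one PCG-style draw (constant names are made up
+ * here; the real values live in munit_rand_next_state() and
+ * munit_rand_from_state() below):
+ *
+ *   state = state * MULTIPLIER + INCREMENT;                // LCG advance
+ *   out   = ((state >> ((state >> 28) + 4)) ^ state) * K;  // permute
+ *   out  ^= out >> 22;                                     // final xorshift
+ *
+ * Because the whole state is a single 32-bit word, an atomic
+ * compare-and-swap can publish a new state without taking a lock. */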
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && !defined(__EMSCRIPTEN__) && (!defined(__GNUC_MINOR__) || (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ > 8))
+# define HAVE_STDATOMIC
+#elif defined(__clang__)
+# if __has_extension(c_atomic)
+#  define HAVE_CLANG_ATOMICS
+# endif
+#endif
+
+/* Workaround for http://llvm.org/bugs/show_bug.cgi?id=26911 */
+#if defined(__clang__) && defined(_WIN32)
+# undef HAVE_STDATOMIC
+# if defined(__c2__)
+#  undef HAVE_CLANG_ATOMICS
+# endif
+#endif
+
+#if defined(_OPENMP)
+# define ATOMIC_UINT32_T uint32_t
+# define ATOMIC_UINT32_INIT(x) (x)
+#elif defined(HAVE_STDATOMIC)
+# include <stdatomic.h>
+# define ATOMIC_UINT32_T _Atomic uint32_t
+# define ATOMIC_UINT32_INIT(x) ATOMIC_VAR_INIT(x)
+#elif defined(HAVE_CLANG_ATOMICS)
+# define ATOMIC_UINT32_T _Atomic uint32_t
+# define ATOMIC_UINT32_INIT(x) (x)
+#elif defined(_WIN32)
+# define ATOMIC_UINT32_T volatile LONG
+# define ATOMIC_UINT32_INIT(x) (x)
+#else
+# define ATOMIC_UINT32_T volatile uint32_t
+# define ATOMIC_UINT32_INIT(x) (x)
+#endif
+
+static ATOMIC_UINT32_T munit_rand_state = ATOMIC_UINT32_INIT(42);
+
+#if defined(_OPENMP)
+static inline void
+munit_atomic_store(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T value) {
+#pragma omp critical (munit_atomics)
+  *dest = value;
+}
+
+static inline uint32_t
+munit_atomic_load(ATOMIC_UINT32_T* src) {
+  uint32_t ret;
+#pragma omp critical (munit_atomics)
+  ret = *src;
+  return ret;
+}
+
+static inline bool
+munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T* expected, ATOMIC_UINT32_T desired) {
+  bool ret;
+
+#pragma omp critical (munit_atomics)
+  {
+    if (*dest == *expected) {
+      *dest = desired;
+      ret = true;
+    } else {
+      ret = false;
+    }
+  }
+
+  return ret;
+}
+#elif defined(HAVE_STDATOMIC)
+# define munit_atomic_store(dest, value) atomic_store(dest, value)
+# define munit_atomic_load(src) atomic_load(src)
+# define munit_atomic_cas(dest, expected, value) atomic_compare_exchange_weak(dest, expected, value)
+#elif defined(HAVE_CLANG_ATOMICS)
+# define munit_atomic_store(dest, value) __c11_atomic_store(dest, value, __ATOMIC_SEQ_CST)
+# define munit_atomic_load(src) __c11_atomic_load(src, __ATOMIC_SEQ_CST)
+# define munit_atomic_cas(dest, expected, value) __c11_atomic_compare_exchange_weak(dest, expected, value, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+# define munit_atomic_store(dest, value) __atomic_store_n(dest, value, __ATOMIC_SEQ_CST)
+# define munit_atomic_load(src) __atomic_load_n(src, __ATOMIC_SEQ_CST)
+# define munit_atomic_cas(dest, expected, value) __atomic_compare_exchange_n(dest, expected, value, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#elif defined(__GNUC__) && (__GNUC__ >= 4)
+# define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0)
+# define munit_atomic_load(src) (*(src))
+# define munit_atomic_cas(dest, expected, value) __sync_bool_compare_and_swap(dest, *expected, value)
+#elif defined(_WIN32) /* Untested */
+# define munit_atomic_store(dest,value) do { *(dest) = (value); } while (0)
+# define munit_atomic_load(src) (*(src))
+# define munit_atomic_cas(dest, expected, value) InterlockedCompareExchange((dest), (value), *(expected))
+#else
+# warning No atomic implementation, PRNG will not be thread-safe
+# define munit_atomic_store(dest, value) do { *(dest) = (value); } while (0)
+# define munit_atomic_load(src) (*(src))
+static inline bool
+munit_atomic_cas(ATOMIC_UINT32_T* dest, ATOMIC_UINT32_T*
expected, ATOMIC_UINT32_T desired) { + if (*dest == *expected) { + *dest = desired; + return true; + } else { + return false; + } +} +#endif + +#define MUNIT_PRNG_MULTIPLIER (747796405U) +#define MUNIT_PRNG_INCREMENT (1729U) + +static munit_uint32_t +munit_rand_next_state(munit_uint32_t state) { + return state * MUNIT_PRNG_MULTIPLIER + MUNIT_PRNG_INCREMENT; +} + +static munit_uint32_t +munit_rand_from_state(munit_uint32_t state) { + munit_uint32_t res = ((state >> ((state >> 28) + 4)) ^ state) * (277803737U); + res ^= res >> 22; + return res; +} + +void +munit_rand_seed(munit_uint32_t seed) { + munit_uint32_t state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); + munit_atomic_store(&munit_rand_state, state); +} + +static munit_uint32_t +munit_rand_generate_seed(void) { + munit_uint32_t seed, state; +#if defined(MUNIT_ENABLE_TIMING) + struct PsnipClockTimespec wc = { 0, 0 }; + + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wc); + seed = (munit_uint32_t) wc.nanoseconds; +#else + seed = (munit_uint32_t) time(NULL); +#endif + + state = munit_rand_next_state(seed + MUNIT_PRNG_INCREMENT); + return munit_rand_from_state(state); +} + +static munit_uint32_t +munit_rand_state_uint32(munit_uint32_t* state) { + const munit_uint32_t old = *state; + *state = munit_rand_next_state(old); + return munit_rand_from_state(old); +} + +munit_uint32_t +munit_rand_uint32(void) { + munit_uint32_t old, state; + + do { + old = munit_atomic_load(&munit_rand_state); + state = munit_rand_next_state(old); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return munit_rand_from_state(old); +} + +static void +munit_rand_state_memory(munit_uint32_t* state, size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) { + size_t members_remaining = size / sizeof(munit_uint32_t); + size_t bytes_remaining = size % sizeof(munit_uint32_t); + munit_uint8_t* b = data; + munit_uint32_t rv; + while (members_remaining-- > 0) { + rv = munit_rand_state_uint32(state); + memcpy(b, &rv, sizeof(munit_uint32_t)); + b += sizeof(munit_uint32_t); + } + if (bytes_remaining != 0) { + rv = munit_rand_state_uint32(state); + memcpy(b, &rv, bytes_remaining); + } +} + +void +munit_rand_memory(size_t size, munit_uint8_t data[MUNIT_ARRAY_PARAM(size)]) { + munit_uint32_t old, state; + + do { + state = old = munit_atomic_load(&munit_rand_state); + munit_rand_state_memory(&state, size, data); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); +} + +static munit_uint32_t +munit_rand_state_at_most(munit_uint32_t* state, munit_uint32_t salt, munit_uint32_t max) { + /* We want (UINT32_MAX + 1) % max, which in unsigned arithmetic is the same + * as (UINT32_MAX + 1 - max) % max = -max % max. We compute -max using not + * to avoid compiler warnings. 
+ */ + const munit_uint32_t min = (~max + 1U) % max; + munit_uint32_t x; + + if (max == (~((munit_uint32_t) 0U))) + return munit_rand_state_uint32(state) ^ salt; + + max++; + + do { + x = munit_rand_state_uint32(state) ^ salt; + } while (x < min); + + return x % max; +} + +static munit_uint32_t +munit_rand_at_most(munit_uint32_t salt, munit_uint32_t max) { + munit_uint32_t old, state; + munit_uint32_t retval; + + do { + state = old = munit_atomic_load(&munit_rand_state); + retval = munit_rand_state_at_most(&state, salt, max); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return retval; +} + +int +munit_rand_int_range(int min, int max) { + munit_uint64_t range = (munit_uint64_t) max - (munit_uint64_t) min; + + if (min > max) + return munit_rand_int_range(max, min); + + if (range > (~((munit_uint32_t) 0U))) + range = (~((munit_uint32_t) 0U)); + + return min + munit_rand_at_most(0, (munit_uint32_t) range); +} + +double +munit_rand_double(void) { + munit_uint32_t old, state; + double retval = 0.0; + + do { + state = old = munit_atomic_load(&munit_rand_state); + + /* See http://mumble.net/~campbell/tmp/random_real.c for how to do + * this right. Patches welcome if you feel that this is too + * biased. */ + retval = munit_rand_state_uint32(&state) / ((~((munit_uint32_t) 0U)) + 1.0); + } while (!munit_atomic_cas(&munit_rand_state, &old, state)); + + return retval; +} + +/*** Test suite handling ***/ + +typedef struct { + unsigned int successful; + unsigned int skipped; + unsigned int failed; + unsigned int errored; +#if defined(MUNIT_ENABLE_TIMING) + munit_uint64_t cpu_clock; + munit_uint64_t wall_clock; +#endif +} MunitReport; + +typedef struct { + const char* prefix; + const MunitSuite* suite; + const char** tests; + munit_uint32_t seed; + unsigned int iterations; + MunitParameter* parameters; + bool single_parameter_mode; + void* user_data; + MunitReport report; + bool colorize; + bool fork; + bool show_stderr; + bool fatal_failures; +} MunitTestRunner; + +const char* +munit_parameters_get(const MunitParameter params[], const char* key) { + const MunitParameter* param; + + for (param = params ; param != NULL && param->name != NULL ; param++) + if (strcmp(param->name, key) == 0) + return param->value; + return NULL; +} + +#if defined(MUNIT_ENABLE_TIMING) +static void +munit_print_time(FILE* fp, munit_uint64_t nanoseconds) { + fprintf(fp, "%" MUNIT_TEST_TIME_FORMAT, ((double) nanoseconds) / ((double) PSNIP_CLOCK_NSEC_PER_SEC)); +} +#endif + +/* Add a parameter to an array of parameters. */ +static MunitResult +munit_parameters_add(size_t* params_size, MunitParameter* params[MUNIT_ARRAY_PARAM(*params_size)], char* name, char* value) { + *params = realloc(*params, sizeof(MunitParameter) * (*params_size + 2)); + if (*params == NULL) + return MUNIT_ERROR; + + (*params)[*params_size].name = name; + (*params)[*params_size].value = value; + (*params_size)++; + (*params)[*params_size].name = NULL; + (*params)[*params_size].value = NULL; + + return MUNIT_OK; +} + +/* Concatenate two strings, but just return one of the components + * unaltered if the other is NULL or "". */ +static char* +munit_maybe_concat(size_t* len, char* prefix, char* suffix) { + char* res; + size_t res_l; + const size_t prefix_l = prefix != NULL ? strlen(prefix) : 0; + const size_t suffix_l = suffix != NULL ? 
strlen(suffix) : 0; + if (prefix_l == 0 && suffix_l == 0) { + res = NULL; + res_l = 0; + } else if (prefix_l == 0 && suffix_l != 0) { + res = suffix; + res_l = suffix_l; + } else if (prefix_l != 0 && suffix_l == 0) { + res = prefix; + res_l = prefix_l; + } else { + res_l = prefix_l + suffix_l; + res = malloc(res_l + 1); + memcpy(res, prefix, prefix_l); + memcpy(res + prefix_l, suffix, suffix_l); + res[res_l] = 0; + } + + if (len != NULL) + *len = res_l; + + return res; +} + +/* Possibly free a string returned by munit_maybe_concat. */ +static void +munit_maybe_free_concat(char* s, const char* prefix, const char* suffix) { + if (prefix != s && suffix != s) + free(s); +} + +/* Cheap string hash function, just used to salt the PRNG. */ +static munit_uint32_t +munit_str_hash(const char* name) { + const char *p; + munit_uint32_t h = 5381U; + + for (p = name; *p != '\0'; p++) + h = (h << 5) + h + *p; + + return h; +} + +static void +munit_splice(int from, int to) { + munit_uint8_t buf[1024]; +#if !defined(_WIN32) + ssize_t len; + ssize_t bytes_written; + ssize_t write_res; +#else + int len; + int bytes_written; + int write_res; +#endif + do { + len = read(from, buf, sizeof(buf)); + if (len > 0) { + bytes_written = 0; + do { + write_res = write(to, buf + bytes_written, len - bytes_written); + if (write_res < 0) + break; + bytes_written += write_res; + } while (bytes_written < len); + } + else + break; + } while (true); +} + +/* This is the part that should be handled in the child process */ +static MunitResult +munit_test_runner_exec(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[], MunitReport* report) { + unsigned int iterations = runner->iterations; + MunitResult result = MUNIT_FAIL; +#if defined(MUNIT_ENABLE_TIMING) + struct PsnipClockTimespec wall_clock_begin = { 0, 0 }, wall_clock_end = { 0, 0 }; + struct PsnipClockTimespec cpu_clock_begin = { 0, 0 }, cpu_clock_end = { 0, 0 }; +#endif + unsigned int i = 0; + + if ((test->options & MUNIT_TEST_OPTION_SINGLE_ITERATION) == MUNIT_TEST_OPTION_SINGLE_ITERATION) + iterations = 1; + else if (iterations == 0) + iterations = runner->suite->iterations; + + munit_rand_seed(runner->seed); + + do { + void* data = (test->setup == NULL) ? 
runner->user_data : test->setup(params, runner->user_data); + +#if defined(MUNIT_ENABLE_TIMING) + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_begin); + psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_begin); +#endif + +#if defined(MUNIT_THREAD_LOCAL) && defined(MUNIT_ALWAYS_TEAR_DOWN) + if (test->tear_down != NULL) { + if (MUNIT_UNLIKELY(setjmp(munit_tear_down_jmp_buf) != 0)) { + test->tear_down(data); + longjmp(munit_error_jmp_buf, 1); + } else { + munit_tear_down_jmp_buf_valid = true; + } + } +#endif + + result = test->test(params, data); + +#if defined(MUNIT_ENABLE_TIMING) + psnip_clock_get_time(PSNIP_CLOCK_TYPE_WALL, &wall_clock_end); + psnip_clock_get_time(PSNIP_CLOCK_TYPE_CPU, &cpu_clock_end); +#endif + + if (test->tear_down != NULL) + test->tear_down(data); + + if (MUNIT_LIKELY(result == MUNIT_OK)) { + report->successful++; +#if defined(MUNIT_ENABLE_TIMING) + report->wall_clock += munit_clock_get_elapsed(&wall_clock_begin, &wall_clock_end); + report->cpu_clock += munit_clock_get_elapsed(&cpu_clock_begin, &cpu_clock_end); +#endif + } else { + switch ((int) result) { + case MUNIT_SKIP: + report->skipped++; + break; + case MUNIT_FAIL: + report->failed++; + break; + case MUNIT_ERROR: + report->errored++; + break; + default: + break; + } + break; + } + } while (++i < iterations); + + return result; +} + +#if defined(MUNIT_EMOTICON) +# define MUNIT_RESULT_STRING_OK ":)" +# define MUNIT_RESULT_STRING_SKIP ":|" +# define MUNIT_RESULT_STRING_FAIL ":(" +# define MUNIT_RESULT_STRING_ERROR ":o" +# define MUNIT_RESULT_STRING_TODO ":/" +#else +# define MUNIT_RESULT_STRING_OK "OK " +# define MUNIT_RESULT_STRING_SKIP "SKIP " +# define MUNIT_RESULT_STRING_FAIL "FAIL " +# define MUNIT_RESULT_STRING_ERROR "ERROR" +# define MUNIT_RESULT_STRING_TODO "TODO " +#endif + +static void +munit_test_runner_print_color(const MunitTestRunner* runner, const char* string, char color) { + if (runner->colorize) + fprintf(MUNIT_OUTPUT_FILE, "\x1b[3%cm%s\x1b[39m", color, string); + else + fputs(string, MUNIT_OUTPUT_FILE); +} + +#if !defined(MUNIT_NO_BUFFER) +static int +munit_replace_stderr(FILE* stderr_buf) { + if (stderr_buf != NULL) { + const int orig_stderr = dup(STDERR_FILENO); + + int errfd = fileno(stderr_buf); + if (MUNIT_UNLIKELY(errfd == -1)) { + exit(EXIT_FAILURE); + } + + dup2(errfd, STDERR_FILENO); + + return orig_stderr; + } + + return -1; +} + +static void +munit_restore_stderr(int orig_stderr) { + if (orig_stderr != -1) { + dup2(orig_stderr, STDERR_FILENO); + close(orig_stderr); + } +} +#endif /* !defined(MUNIT_NO_BUFFER) */ + +/* Run a test with the specified parameters. 
*/ +static void +munit_test_runner_run_test_with_params(MunitTestRunner* runner, const MunitTest* test, const MunitParameter params[]) { + MunitResult result = MUNIT_OK; + MunitReport report = { + 0, 0, 0, 0, +#if defined(MUNIT_ENABLE_TIMING) + 0, 0 +#endif + }; + unsigned int output_l; + bool first; + const MunitParameter* param; + FILE* stderr_buf; +#if !defined(MUNIT_NO_FORK) + int pipefd[2]; + pid_t fork_pid; + ssize_t bytes_written = 0; + ssize_t write_res; + ssize_t bytes_read = 0; + ssize_t read_res; + int status = 0; + pid_t changed_pid; +#endif + + if (params != NULL) { + output_l = 2; + fputs(" ", MUNIT_OUTPUT_FILE); + first = true; + for (param = params ; param != NULL && param->name != NULL ; param++) { + if (!first) { + fputs(", ", MUNIT_OUTPUT_FILE); + output_l += 2; + } else { + first = false; + } + + output_l += fprintf(MUNIT_OUTPUT_FILE, "%s=%s", param->name, param->value); + } + while (output_l++ < MUNIT_TEST_NAME_LEN) { + fputc(' ', MUNIT_OUTPUT_FILE); + } + } + + fflush(MUNIT_OUTPUT_FILE); + + stderr_buf = NULL; +#if !defined(_WIN32) || defined(__MINGW32__) + stderr_buf = tmpfile(); +#else + tmpfile_s(&stderr_buf); +#endif + if (stderr_buf == NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create buffer for stderr"); + result = MUNIT_ERROR; + goto print_result; + } + +#if !defined(MUNIT_NO_FORK) + if (runner->fork) { + pipefd[0] = -1; + pipefd[1] = -1; + if (pipe(pipefd) != 0) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to create pipe"); + result = MUNIT_ERROR; + goto print_result; + } + + fork_pid = fork(); + if (fork_pid == 0) { + int orig_stderr; + + close(pipefd[0]); + + orig_stderr = munit_replace_stderr(stderr_buf); + munit_test_runner_exec(runner, test, params, &report); + + /* Note that we don't restore stderr. This is so we can buffer + * things written to stderr later on (such as by + * asan/tsan/ubsan, valgrind, etc.) 
*/ + close(orig_stderr); + + do { + write_res = write(pipefd[1], ((munit_uint8_t*) (&report)) + bytes_written, sizeof(report) - bytes_written); + if (write_res < 0) { + if (stderr_buf != NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to write to pipe"); + } + exit(EXIT_FAILURE); + } + bytes_written += write_res; + } while ((size_t) bytes_written < sizeof(report)); + + if (stderr_buf != NULL) + fclose(stderr_buf); + close(pipefd[1]); + + exit(EXIT_SUCCESS); + } else if (fork_pid == -1) { + close(pipefd[0]); + close(pipefd[1]); + if (stderr_buf != NULL) { + munit_log_errno(MUNIT_LOG_ERROR, stderr, "unable to fork"); + } + report.errored++; + result = MUNIT_ERROR; + } else { + close(pipefd[1]); + do { + read_res = read(pipefd[0], ((munit_uint8_t*) (&report)) + bytes_read, sizeof(report) - bytes_read); + if (read_res < 1) + break; + bytes_read += read_res; + } while (bytes_read < (ssize_t) sizeof(report)); + + changed_pid = waitpid(fork_pid, &status, 0); + + if (MUNIT_LIKELY(changed_pid == fork_pid) && MUNIT_LIKELY(WIFEXITED(status))) { + if (bytes_read != sizeof(report)) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited unexpectedly with status %d", WEXITSTATUS(status)); + report.errored++; + } else if (WEXITSTATUS(status) != EXIT_SUCCESS) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child exited with status %d", WEXITSTATUS(status)); + report.errored++; + } + } else { + if (WIFSIGNALED(status)) { +#if defined(_XOPEN_VERSION) && (_XOPEN_VERSION >= 700) + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d (%s)", WTERMSIG(status), strsignal(WTERMSIG(status))); +#else + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child killed by signal %d", WTERMSIG(status)); +#endif + } else if (WIFSTOPPED(status)) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr_buf, "child stopped by signal %d", WSTOPSIG(status)); + } + report.errored++; + } + + close(pipefd[0]); + waitpid(fork_pid, NULL, 0); + } + } else +#endif + { +#if !defined(MUNIT_NO_BUFFER) + const volatile int orig_stderr = munit_replace_stderr(stderr_buf); +#endif + +#if defined(MUNIT_THREAD_LOCAL) + if (MUNIT_UNLIKELY(setjmp(munit_error_jmp_buf) != 0)) { + result = MUNIT_FAIL; + report.failed++; + } else { + munit_error_jmp_buf_valid = true; + result = munit_test_runner_exec(runner, test, params, &report); + } +#else + result = munit_test_runner_exec(runner, test, params, &report); +#endif + +#if !defined(MUNIT_NO_BUFFER) + munit_restore_stderr(orig_stderr); +#endif + + /* Here just so that the label is used on Windows and we don't get + * a warning */ + goto print_result; + } + + print_result: + + fputs("[ ", MUNIT_OUTPUT_FILE); + if ((test->options & MUNIT_TEST_OPTION_TODO) == MUNIT_TEST_OPTION_TODO) { + if (report.failed != 0 || report.errored != 0 || report.skipped != 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_TODO, '3'); + result = MUNIT_OK; + } else { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); + if (MUNIT_LIKELY(stderr_buf != NULL)) + munit_log_internal(MUNIT_LOG_ERROR, stderr_buf, "Test marked TODO, but was successful."); + runner->report.failed++; + result = MUNIT_ERROR; + } + } else if (report.failed > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_FAIL, '1'); + runner->report.failed++; + result = MUNIT_FAIL; + } else if (report.errored > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_ERROR, '1'); + runner->report.errored++; + result = MUNIT_ERROR; + } else if (report.skipped > 0) { + 
munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_SKIP, '3'); + runner->report.skipped++; + result = MUNIT_SKIP; + } else if (report.successful > 1) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); +#if defined(MUNIT_ENABLE_TIMING) + fputs(" ] [ ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock / report.successful); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock / report.successful); + fprintf(MUNIT_OUTPUT_FILE, " CPU ]\n %-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s Total: [ ", ""); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); + fputs(" CPU", MUNIT_OUTPUT_FILE); +#endif + runner->report.successful++; + result = MUNIT_OK; + } else if (report.successful > 0) { + munit_test_runner_print_color(runner, MUNIT_RESULT_STRING_OK, '2'); +#if defined(MUNIT_ENABLE_TIMING) + fputs(" ] [ ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.wall_clock); + fputs(" / ", MUNIT_OUTPUT_FILE); + munit_print_time(MUNIT_OUTPUT_FILE, report.cpu_clock); + fputs(" CPU", MUNIT_OUTPUT_FILE); +#endif + runner->report.successful++; + result = MUNIT_OK; + } + fputs(" ]\n", MUNIT_OUTPUT_FILE); + + if (stderr_buf != NULL) { + if (result == MUNIT_FAIL || result == MUNIT_ERROR || runner->show_stderr) { + fflush(MUNIT_OUTPUT_FILE); + + rewind(stderr_buf); + munit_splice(fileno(stderr_buf), STDERR_FILENO); + + fflush(stderr); + } + + fclose(stderr_buf); + } +} + +static void +munit_test_runner_run_test_wild(MunitTestRunner* runner, + const MunitTest* test, + const char* test_name, + MunitParameter* params, + MunitParameter* p) { + const MunitParameterEnum* pe; + char** values; + MunitParameter* next; + + for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) { + if (p->name == pe->name) + break; + } + + if (pe == NULL) + return; + + for (values = pe->values ; *values != NULL ; values++) { + next = p + 1; + p->value = *values; + if (next->name == NULL) { + munit_test_runner_run_test_with_params(runner, test, params); + } else { + munit_test_runner_run_test_wild(runner, test, test_name, params, next); + } + if (runner->fatal_failures && (runner->report.failed != 0 || runner->report.errored != 0)) + break; + } +} + +/* Run a single test, with every combination of parameters + * requested. */ +static void +munit_test_runner_run_test(MunitTestRunner* runner, + const MunitTest* test, + const char* prefix) { + char* test_name = munit_maybe_concat(NULL, (char*) prefix, (char*) test->name); + /* The array of parameters to pass to + * munit_test_runner_run_test_with_params */ + MunitParameter* params = NULL; + size_t params_l = 0; + /* Wildcard parameters are parameters which have possible values + * specified in the test, but no specific value was passed to the + * CLI. That means we want to run the test once for every + * possible combination of parameter values or, if --single was + * passed to the CLI, a single time with a random set of + * parameters. */ + MunitParameter* wild_params = NULL; + size_t wild_params_l = 0; + const MunitParameterEnum* pe; + const MunitParameter* cli_p; + bool filled; + unsigned int possible; + char** vals; + size_t first_wild; + const MunitParameter* wp; + int pidx; + + munit_rand_seed(runner->seed); + + fprintf(MUNIT_OUTPUT_FILE, "%-" MUNIT_XSTRINGIFY(MUNIT_TEST_NAME_LEN) "s", test_name); + + if (test->parameters == NULL) { + /* No parameters. Simple, nice. 
*/
+    munit_test_runner_run_test_with_params(runner, test, NULL);
+  } else {
+    fputc('\n', MUNIT_OUTPUT_FILE);
+
+    for (pe = test->parameters ; pe != NULL && pe->name != NULL ; pe++) {
+      /* Did we receive a value for this parameter from the CLI? */
+      filled = false;
+      for (cli_p = runner->parameters ; cli_p != NULL && cli_p->name != NULL ; cli_p++) {
+        if (strcmp(cli_p->name, pe->name) == 0) {
+          if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, cli_p->value) != MUNIT_OK))
+            goto cleanup;
+          filled = true;
+          break;
+        }
+      }
+      if (filled)
+        continue;
+
+      /* Nothing from CLI, is the enum NULL/empty? We're not a
+       * fuzzer… */
+      if (pe->values == NULL || pe->values[0] == NULL)
+        continue;
+
+      /* If --single was passed to the CLI, choose a value from the
+       * list of possibilities randomly. */
+      if (runner->single_parameter_mode) {
+        possible = 0;
+        for (vals = pe->values ; *vals != NULL ; vals++)
+          possible++;
+        /* We want the tests to be reproducible, even if you're only
+         * running a single test, but we don't want every test with
+         * the same number of parameters to choose the same parameter
+         * number, so use the test name as a primitive salt. */
+        pidx = munit_rand_at_most(munit_str_hash(test_name), possible - 1);
+        if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[pidx]) != MUNIT_OK))
+          goto cleanup;
+      } else {
+        /* We want to try every permutation.  Put in a placeholder
+         * entry, we'll iterate through them later. */
+        if (MUNIT_UNLIKELY(munit_parameters_add(&wild_params_l, &wild_params, pe->name, NULL) != MUNIT_OK))
+          goto cleanup;
+      }
+    }
+
+    if (wild_params_l != 0) {
+      first_wild = params_l;
+      for (wp = wild_params ; wp != NULL && wp->name != NULL ; wp++) {
+        for (pe = test->parameters ; pe != NULL && pe->name != NULL && pe->values != NULL ; pe++) {
+          if (strcmp(wp->name, pe->name) == 0) {
+            if (MUNIT_UNLIKELY(munit_parameters_add(&params_l, &params, pe->name, pe->values[0]) != MUNIT_OK))
+              goto cleanup;
+          }
+        }
+      }
+
+      munit_test_runner_run_test_wild(runner, test, test_name, params, params + first_wild);
+    } else {
+      munit_test_runner_run_test_with_params(runner, test, params);
+    }
+
+  cleanup:
+    free(params);
+    free(wild_params);
+  }
+
+  munit_maybe_free_concat(test_name, prefix, test->name);
+}
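+
+/* Worked example (illustrative, not part of the diff): a test declaring
+ *
+ *   static char* backend_values[] = { (char*) "mem", (char*) "disk", NULL };
+ *   static char* level_values[]   = { (char*) "0", (char*) "1", NULL };
+ *   static MunitParameterEnum test_params[] = {
+ *     { (char*) "backend", backend_values },
+ *     { (char*) "level", level_values },
+ *     { NULL, NULL },
+ *   };
+ *
+ * runs four times (backend=mem/level=0, mem/1, disk/0, disk/1) when no
+ * matching --param is given on the command line; with --single a single
+ * combination is chosen pseudo-randomly, salted with the test name so
+ * different tests pick different combinations. */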
*/ + for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) { + munit_test_runner_run_suite(runner, child_suite, pre); + } + + cleanup: + + munit_maybe_free_concat(pre, prefix, suite->prefix); +} + +static void +munit_test_runner_run(MunitTestRunner* runner) { + munit_test_runner_run_suite(runner, runner->suite, NULL); +} + +static void +munit_print_help(int argc, char* const argv[MUNIT_ARRAY_PARAM(argc + 1)], void* user_data, const MunitArgument arguments[]) { + const MunitArgument* arg; + (void) argc; + + printf("USAGE: %s [OPTIONS...] [TEST...]\n\n", argv[0]); + puts(" --seed SEED\n" + " Value used to seed the PRNG. Must be a 32-bit integer in decimal\n" + " notation with no separators (commas, decimals, spaces, etc.), or\n" + " hexadecimal prefixed by \"0x\".\n" + " --iterations N\n" + " Run each test N times. 0 means the default number.\n" + " --param name value\n" + " A parameter key/value pair which will be passed to any test with\n" + " takes a parameter of that name. If not provided, the test will be\n" + " run once for each possible parameter value.\n" + " --list Write a list of all available tests.\n" + " --list-params\n" + " Write a list of all available tests and their possible parameters.\n" + " --single Run each parameterized test in a single configuration instead of\n" + " every possible combination\n" + " --log-visible debug|info|warning|error\n" + " --log-fatal debug|info|warning|error\n" + " Set the level at which messages of different severities are visible,\n" + " or cause the test to terminate.\n" +#if !defined(MUNIT_NO_FORK) + " --no-fork Do not execute tests in a child process. If this option is supplied\n" + " and a test crashes (including by failing an assertion), no further\n" + " tests will be performed.\n" +#endif + " --fatal-failures\n" + " Stop executing tests as soon as a failure is found.\n" + " --show-stderr\n" + " Show data written to stderr by the tests, even if the test succeeds.\n" + " --color auto|always|never\n" + " Colorize (or don't) the output.\n" + /* 12345678901234567890123456789012345678901234567890123456789012345678901234567890 */ + " --help Print this help message and exit.\n"); +#if defined(MUNIT_NL_LANGINFO) + setlocale(LC_ALL, ""); + fputs((strcasecmp("UTF-8", nl_langinfo(CODESET)) == 0) ? 
"µnit" : "munit", stdout); +#else + puts("munit"); +#endif + printf(" %d.%d.%d\n" + "Full documentation at: https://nemequ.github.io/munit/\n", + (MUNIT_CURRENT_VERSION >> 16) & 0xff, + (MUNIT_CURRENT_VERSION >> 8) & 0xff, + (MUNIT_CURRENT_VERSION >> 0) & 0xff); + for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++) + arg->write_help(arg, user_data); +} + +static const MunitArgument* +munit_arguments_find(const MunitArgument arguments[], const char* name) { + const MunitArgument* arg; + + for (arg = arguments ; arg != NULL && arg->name != NULL ; arg++) + if (strcmp(arg->name, name) == 0) + return arg; + + return NULL; +} + +static void +munit_suite_list_tests(const MunitSuite* suite, bool show_params, const char* prefix) { + size_t pre_l; + char* pre = munit_maybe_concat(&pre_l, (char*) prefix, (char*) suite->prefix); + const MunitTest* test; + const MunitParameterEnum* params; + bool first; + char** val; + const MunitSuite* child_suite; + + for (test = suite->tests ; + test != NULL && test->name != NULL ; + test++) { + if (pre != NULL) + fputs(pre, stdout); + puts(test->name); + + if (show_params) { + for (params = test->parameters ; + params != NULL && params->name != NULL ; + params++) { + fprintf(stdout, " - %s: ", params->name); + if (params->values == NULL) { + puts("Any"); + } else { + first = true; + for (val = params->values ; + *val != NULL ; + val++ ) { + if(!first) { + fputs(", ", stdout); + } else { + first = false; + } + fputs(*val, stdout); + } + putc('\n', stdout); + } + } + } + } + + for (child_suite = suite->suites ; child_suite != NULL && child_suite->prefix != NULL ; child_suite++) { + munit_suite_list_tests(child_suite, show_params, pre); + } + + munit_maybe_free_concat(pre, prefix, suite->prefix); +} + +static bool +munit_stream_supports_ansi(FILE *stream) { +#if !defined(_WIN32) + return isatty(fileno(stream)); +#else + +#if !defined(__MINGW32__) + size_t ansicon_size = 0; +#endif + + if (isatty(fileno(stream))) { +#if !defined(__MINGW32__) + getenv_s(&ansicon_size, NULL, 0, "ANSICON"); + return ansicon_size != 0; +#else + return getenv("ANSICON") != NULL; +#endif + } + return false; +#endif +} + +int +munit_suite_main_custom(const MunitSuite* suite, void* user_data, + int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)], + const MunitArgument arguments[]) { + int result = EXIT_FAILURE; + MunitTestRunner runner; + size_t parameters_size = 0; + size_t tests_size = 0; + int arg; + + char* envptr; + unsigned long ts; + char* endptr; + unsigned long long iterations; + MunitLogLevel level; + const MunitArgument* argument; + const char** runner_tests; + unsigned int tests_run; + unsigned int tests_total; + + runner.prefix = NULL; + runner.suite = NULL; + runner.tests = NULL; + runner.seed = 0; + runner.iterations = 0; + runner.parameters = NULL; + runner.single_parameter_mode = false; + runner.user_data = NULL; + + runner.report.successful = 0; + runner.report.skipped = 0; + runner.report.failed = 0; + runner.report.errored = 0; +#if defined(MUNIT_ENABLE_TIMING) + runner.report.cpu_clock = 0; + runner.report.wall_clock = 0; +#endif + + runner.colorize = false; +#if !defined(_WIN32) + runner.fork = true; +#else + runner.fork = false; +#endif + runner.show_stderr = false; + runner.fatal_failures = false; + runner.suite = suite; + runner.user_data = user_data; + runner.seed = munit_rand_generate_seed(); + runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); + + for (arg = 1 ; arg < argc ; arg++) { + if (strncmp("--", argv[arg], 2) == 0) { + if 
(strcmp("seed", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); + goto cleanup; + } + + envptr = argv[arg + 1]; + ts = strtoul(argv[arg + 1], &envptr, 0); + if (*envptr != '\0' || ts > (~((munit_uint32_t) 0U))) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); + goto cleanup; + } + runner.seed = (munit_uint32_t) ts; + + arg++; + } else if (strcmp("iterations", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); + goto cleanup; + } + + endptr = argv[arg + 1]; + iterations = strtoul(argv[arg + 1], &endptr, 0); + if (*endptr != '\0' || iterations > UINT_MAX) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); + goto cleanup; + } + + runner.iterations = (unsigned int) iterations; + + arg++; + } else if (strcmp("param", argv[arg] + 2) == 0) { + if (arg + 2 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires two arguments", argv[arg]); + goto cleanup; + } + + runner.parameters = realloc(runner.parameters, sizeof(MunitParameter) * (parameters_size + 2)); + if (runner.parameters == NULL) { + munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory"); + goto cleanup; + } + runner.parameters[parameters_size].name = (char*) argv[arg + 1]; + runner.parameters[parameters_size].value = (char*) argv[arg + 2]; + parameters_size++; + runner.parameters[parameters_size].name = NULL; + runner.parameters[parameters_size].value = NULL; + arg += 2; + } else if (strcmp("color", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); + goto cleanup; + } + + if (strcmp(argv[arg + 1], "always") == 0) + runner.colorize = true; + else if (strcmp(argv[arg + 1], "never") == 0) + runner.colorize = false; + else if (strcmp(argv[arg + 1], "auto") == 0) + runner.colorize = munit_stream_supports_ansi(MUNIT_OUTPUT_FILE); + else { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); + goto cleanup; + } + + arg++; + } else if (strcmp("help", argv[arg] + 2) == 0) { + munit_print_help(argc, argv, user_data, arguments); + result = EXIT_SUCCESS; + goto cleanup; + } else if (strcmp("single", argv[arg] + 2) == 0) { + runner.single_parameter_mode = true; + } else if (strcmp("show-stderr", argv[arg] + 2) == 0) { + runner.show_stderr = true; +#if !defined(_WIN32) + } else if (strcmp("no-fork", argv[arg] + 2) == 0) { + runner.fork = false; +#endif + } else if (strcmp("fatal-failures", argv[arg] + 2) == 0) { + runner.fatal_failures = true; + } else if (strcmp("log-visible", argv[arg] + 2) == 0 || + strcmp("log-fatal", argv[arg] + 2) == 0) { + if (arg + 1 >= argc) { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "%s requires an argument", argv[arg]); + goto cleanup; + } + + if (strcmp(argv[arg + 1], "debug") == 0) + level = MUNIT_LOG_DEBUG; + else if (strcmp(argv[arg + 1], "info") == 0) + level = MUNIT_LOG_INFO; + else if (strcmp(argv[arg + 1], "warning") == 0) + level = MUNIT_LOG_WARNING; + else if (strcmp(argv[arg + 1], "error") == 0) + level = MUNIT_LOG_ERROR; + else { + munit_logf_internal(MUNIT_LOG_ERROR, stderr, "invalid value ('%s') passed to %s", argv[arg + 1], argv[arg]); + goto cleanup; + } + + if (strcmp("log-visible", argv[arg] + 2) == 0) + munit_log_level_visible = level; + 
else
+            munit_log_level_fatal = level;
+
+          arg++;
+        } else if (strcmp("list", argv[arg] + 2) == 0) {
+          munit_suite_list_tests(suite, false, NULL);
+          result = EXIT_SUCCESS;
+          goto cleanup;
+        } else if (strcmp("list-params", argv[arg] + 2) == 0) {
+          munit_suite_list_tests(suite, true, NULL);
+          result = EXIT_SUCCESS;
+          goto cleanup;
+        } else {
+          argument = munit_arguments_find(arguments, argv[arg] + 2);
+          if (argument == NULL) {
+            munit_logf_internal(MUNIT_LOG_ERROR, stderr, "unknown argument ('%s')", argv[arg]);
+            goto cleanup;
+          }
+
+          if (!argument->parse_argument(suite, user_data, &arg, argc, argv))
+            goto cleanup;
+        }
+      } else {
+        runner_tests = realloc((void*) runner.tests, sizeof(char*) * (tests_size + 2));
+        if (runner_tests == NULL) {
+          munit_log_internal(MUNIT_LOG_ERROR, stderr, "failed to allocate memory");
+          goto cleanup;
+        }
+        runner.tests = runner_tests;
+        runner.tests[tests_size++] = argv[arg];
+        runner.tests[tests_size] = NULL;
+      }
+    }
+
+  fflush(stderr);
+  fprintf(MUNIT_OUTPUT_FILE, "Running test suite with seed 0x%08" PRIx32 "...\n", runner.seed);
+
+  munit_test_runner_run(&runner);
+
+  tests_run = runner.report.successful + runner.report.failed + runner.report.errored;
+  tests_total = tests_run + runner.report.skipped;
+  if (tests_run == 0) {
+    fprintf(stderr, "No tests run, %d (100%%) skipped.\n", runner.report.skipped);
+  } else {
+    fprintf(MUNIT_OUTPUT_FILE, "%d of %d (%0.0f%%) tests successful, %d (%0.0f%%) tests skipped.\n",
+            runner.report.successful, tests_run,
+            (((double) runner.report.successful) / ((double) tests_run)) * 100.0,
+            runner.report.skipped,
+            (((double) runner.report.skipped) / ((double) tests_total)) * 100.0);
+  }
+
+  if (runner.report.failed == 0 && runner.report.errored == 0) {
+    result = EXIT_SUCCESS;
+  }
+
+ cleanup:
+  free(runner.parameters);
+  free((void*) runner.tests);
+
+  return result;
+}
+
+int
+munit_suite_main(const MunitSuite* suite, void* user_data,
+                 int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]) {
+  return munit_suite_main_custom(suite, user_data, argc, argv, NULL);
+}
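+
+/* Example invocations (illustrative; the binary and test names are
+ * hypothetical, the flags are the ones handled above):
+ *
+ *   ./unit-test --seed 0x12345678          replay a previous run exactly
+ *   ./unit-test --iterations 100 vfs       repeat the tests matching "vfs"
+ *   ./unit-test --no-fork --show-stderr    debug a crashing test under gdb
+ */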
diff --git a/test/raft/lib/munit.h b/test/raft/lib/munit.h
new file mode 100644
index 000000000..0b6796b4b
--- /dev/null
+++ b/test/raft/lib/munit.h
@@ -0,0 +1,535 @@
+/* µnit Testing Framework
+ * Copyright (c) 2013-2017 Evan Nemerson
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(MUNIT_H)
+#define MUNIT_H
+
+#include <stdarg.h>
+#include <stdlib.h>
+
+#define MUNIT_VERSION(major, minor, revision) \
+  (((major) << 16) | ((minor) << 8) | (revision))
+
+#define MUNIT_CURRENT_VERSION MUNIT_VERSION(0, 4, 1)
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600)
+# define munit_int8_t __int8
+# define munit_uint8_t unsigned __int8
+# define munit_int16_t __int16
+# define munit_uint16_t unsigned __int16
+# define munit_int32_t __int32
+# define munit_uint32_t unsigned __int32
+# define munit_int64_t __int64
+# define munit_uint64_t unsigned __int64
+#else
+# include <stdint.h>
+# define munit_int8_t int8_t
+# define munit_uint8_t uint8_t
+# define munit_int16_t int16_t
+# define munit_uint16_t uint16_t
+# define munit_int32_t int32_t
+# define munit_uint32_t uint32_t
+# define munit_int64_t int64_t
+# define munit_uint64_t uint64_t
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
+# if !defined(PRIi8)
+#  define PRIi8 "i"
+# endif
+# if !defined(PRIi16)
+#  define PRIi16 "i"
+# endif
+# if !defined(PRIi32)
+#  define PRIi32 "i"
+# endif
+# if !defined(PRIi64)
+#  define PRIi64 "I64i"
+# endif
+# if !defined(PRId8)
+#  define PRId8 "d"
+# endif
+# if !defined(PRId16)
+#  define PRId16 "d"
+# endif
+# if !defined(PRId32)
+#  define PRId32 "d"
+# endif
+# if !defined(PRId64)
+#  define PRId64 "I64d"
+# endif
+# if !defined(PRIx8)
+#  define PRIx8 "x"
+# endif
+# if !defined(PRIx16)
+#  define PRIx16 "x"
+# endif
+# if !defined(PRIx32)
+#  define PRIx32 "x"
+# endif
+# if !defined(PRIx64)
+#  define PRIx64 "I64x"
+# endif
+# if !defined(PRIu8)
+#  define PRIu8 "u"
+# endif
+# if !defined(PRIu16)
+#  define PRIu16 "u"
+# endif
+# if !defined(PRIu32)
+#  define PRIu32 "u"
+# endif
+# if !defined(PRIu64)
+#  define PRIu64 "I64u"
+# endif
+# if !defined(bool)
+#  define bool int
+# endif
+# if !defined(true)
+#  define true (!0)
+# endif
+# if !defined(false)
+#  define false (!!0)
+# endif
+#else
+# include <stdbool.h>
+# include <inttypes.h>
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+# define MUNIT_LIKELY(expr) (__builtin_expect ((expr), 1))
+# define MUNIT_UNLIKELY(expr) (__builtin_expect ((expr), 0))
+# define MUNIT_UNUSED __attribute__((__unused__))
+#else
+# define MUNIT_LIKELY(expr) (expr)
+# define MUNIT_UNLIKELY(expr) (expr)
+# define MUNIT_UNUSED
+#endif
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__PGI)
+# define MUNIT_ARRAY_PARAM(name) name
+#else
+# define MUNIT_ARRAY_PARAM(name)
+#endif
+
+#if !defined(_WIN32)
+# define MUNIT_SIZE_MODIFIER "z"
+# define MUNIT_CHAR_MODIFIER "hh"
+# define MUNIT_SHORT_MODIFIER "h"
+#else
+# if defined(_M_X64) || defined(__amd64__)
+#  define MUNIT_SIZE_MODIFIER "I64"
+# else
+#  define MUNIT_SIZE_MODIFIER ""
+# endif
+# define MUNIT_CHAR_MODIFIER ""
+# define MUNIT_SHORT_MODIFIER ""
+#endif
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+# define MUNIT_NO_RETURN _Noreturn
+#elif defined(__GNUC__)
+# define MUNIT_NO_RETURN __attribute__((__noreturn__))
+#elif defined(_MSC_VER)
+# define MUNIT_NO_RETURN __declspec(noreturn)
+#else
+# define MUNIT_NO_RETURN
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1500)
+# define MUNIT__PUSH_DISABLE_MSVC_C4127 __pragma(warning(push)) __pragma(warning(disable:4127))
+# define MUNIT__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
+#else
+# define MUNIT__PUSH_DISABLE_MSVC_C4127
+# define MUNIT__POP_DISABLE_MSVC_C4127
+#endif
+
+typedef enum {
+  MUNIT_LOG_DEBUG,
+  MUNIT_LOG_INFO,
+  MUNIT_LOG_WARNING,
+  MUNIT_LOG_ERROR
+} MunitLogLevel;
+
+#if defined(__GNUC__) &&
!defined(__MINGW32__) +# define MUNIT_PRINTF(string_index, first_to_check) __attribute__((format (printf, string_index, first_to_check))) +#else +# define MUNIT_PRINTF(string_index, first_to_check) +#endif + +MUNIT_PRINTF(4, 5) +void munit_logf_ex(MunitLogLevel level, const char* filename, int line, const char* format, ...); + +#define munit_logf(level, format, ...) \ + munit_logf_ex(level, __FILE__, __LINE__, format, __VA_ARGS__) + +#define munit_log(level, msg) \ + munit_logf(level, "%s", msg) + +MUNIT_NO_RETURN +MUNIT_PRINTF(3, 4) +void munit_errorf_ex(const char* filename, int line, const char* format, ...); + +#define munit_errorf(format, ...) \ + munit_errorf_ex(__FILE__, __LINE__, format, __VA_ARGS__) + +#define munit_error(msg) \ + munit_errorf("%s", msg) + +#define munit_assert(expr) \ + do { \ + if (!MUNIT_LIKELY(expr)) { \ + munit_error("assertion failed: " #expr); \ + } \ + MUNIT__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + MUNIT__POP_DISABLE_MSVC_C4127 + +#define munit_assert_true(expr) \ + do { \ + if (!MUNIT_LIKELY(expr)) { \ + munit_error("assertion failed: " #expr " is not true"); \ + } \ + MUNIT__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + MUNIT__POP_DISABLE_MSVC_C4127 + +#define munit_assert_false(expr) \ + do { \ + if (!MUNIT_LIKELY(!(expr))) { \ + munit_error("assertion failed: " #expr " is not false"); \ + } \ + MUNIT__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + MUNIT__POP_DISABLE_MSVC_C4127 + +#define munit_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ + do { \ + T munit_tmp_a_ = (a); \ + T munit_tmp_b_ = (b); \ + if (!(munit_tmp_a_ op munit_tmp_b_)) { \ + munit_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")", \ + #a, #op, #b, munit_tmp_a_, #op, munit_tmp_b_); \ + } \ + MUNIT__PUSH_DISABLE_MSVC_C4127 \ + } while (0) \ + MUNIT__POP_DISABLE_MSVC_C4127 + +#define munit_assert_type(T, fmt, a, op, b) \ + munit_assert_type_full("", "", T, fmt, a, op, b) + +#define munit_assert_char(a, op, b) \ + munit_assert_type_full("'\\x", "'", char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b) +#define munit_assert_uchar(a, op, b) \ + munit_assert_type_full("'\\x", "'", unsigned char, "02" MUNIT_CHAR_MODIFIER "x", a, op, b) +#define munit_assert_short(a, op, b) \ + munit_assert_type(short, MUNIT_SHORT_MODIFIER "d", a, op, b) +#define munit_assert_ushort(a, op, b) \ + munit_assert_type(unsigned short, MUNIT_SHORT_MODIFIER "u", a, op, b) +#define munit_assert_int(a, op, b) \ + munit_assert_type(int, "d", a, op, b) +#define munit_assert_uint(a, op, b) \ + munit_assert_type(unsigned int, "u", a, op, b) +#define munit_assert_long(a, op, b) \ + munit_assert_type(long int, "ld", a, op, b) +#define munit_assert_ulong(a, op, b) \ + munit_assert_type(unsigned long int, "lu", a, op, b) +#define munit_assert_llong(a, op, b) \ + munit_assert_type(long long int, "lld", a, op, b) +#define munit_assert_ullong(a, op, b) \ + munit_assert_type(unsigned long long int, "llu", a, op, b) + +#define munit_assert_size(a, op, b) \ + munit_assert_type(size_t, MUNIT_SIZE_MODIFIER "u", a, op, b) + +#define munit_assert_float(a, op, b) \ + munit_assert_type(float, "f", a, op, b) +#define munit_assert_double(a, op, b) \ + munit_assert_type(double, "g", a, op, b) +#define munit_assert_ptr(a, op, b) \ + munit_assert_type(const void*, "p", a, op, b) + +#define munit_assert_int8(a, op, b) \ + munit_assert_type(munit_int8_t, PRIi8, a, op, b) +#define munit_assert_uint8(a, op, b) \ + munit_assert_type(munit_uint8_t, PRIu8, a, op, b) +#define munit_assert_int16(a, op, b) \ + 
munit_assert_type(munit_int16_t, PRIi16, a, op, b)
+#define munit_assert_uint16(a, op, b) \
+  munit_assert_type(munit_uint16_t, PRIu16, a, op, b)
+#define munit_assert_int32(a, op, b) \
+  munit_assert_type(munit_int32_t, PRIi32, a, op, b)
+#define munit_assert_uint32(a, op, b) \
+  munit_assert_type(munit_uint32_t, PRIu32, a, op, b)
+#define munit_assert_int64(a, op, b) \
+  munit_assert_type(munit_int64_t, PRIi64, a, op, b)
+#define munit_assert_uint64(a, op, b) \
+  munit_assert_type(munit_uint64_t, PRIu64, a, op, b)
+
+#define munit_assert_double_equal(a, b, precision) \
+  do { \
+    const double munit_tmp_a_ = (a); \
+    const double munit_tmp_b_ = (b); \
+    const double munit_tmp_diff_ = ((munit_tmp_a_ - munit_tmp_b_) < 0) ? \
+      -(munit_tmp_a_ - munit_tmp_b_) : \
+      (munit_tmp_a_ - munit_tmp_b_); \
+    if (MUNIT_UNLIKELY(munit_tmp_diff_ > 1e-##precision)) { \
+      munit_errorf("assertion failed: %s == %s (%0." #precision "g == %0." #precision "g)", \
+                   #a, #b, munit_tmp_a_, munit_tmp_b_); \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#include <string.h>
+#define munit_assert_string_equal(a, b) \
+  do { \
+    const char* munit_tmp_a_ = a; \
+    const char* munit_tmp_b_ = b; \
+    if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) != 0)) { \
+      munit_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")", \
+                   #a, #b, munit_tmp_a_, munit_tmp_b_); \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#define munit_assert_string_not_equal(a, b) \
+  do { \
+    const char* munit_tmp_a_ = a; \
+    const char* munit_tmp_b_ = b; \
+    if (MUNIT_UNLIKELY(strcmp(munit_tmp_a_, munit_tmp_b_) == 0)) { \
+      munit_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")", \
+                   #a, #b, munit_tmp_a_, munit_tmp_b_); \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#define munit_assert_memory_equal(size, a, b) \
+  do { \
+    const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \
+    const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \
+    const size_t munit_tmp_size_ = (size); \
+    if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_) != 0)) { \
+      size_t munit_tmp_pos_; \
+      for (munit_tmp_pos_ = 0 ; munit_tmp_pos_ < munit_tmp_size_ ; munit_tmp_pos_++) { \
+        if (munit_tmp_a_[munit_tmp_pos_] != munit_tmp_b_[munit_tmp_pos_]) { \
+          munit_errorf("assertion failed: memory %s == %s, at offset %" MUNIT_SIZE_MODIFIER "u", \
+                       #a, #b, munit_tmp_pos_); \
+          break; \
+        } \
+      } \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#define munit_assert_memory_not_equal(size, a, b) \
+  do { \
+    const unsigned char* munit_tmp_a_ = (const unsigned char*) (a); \
+    const unsigned char* munit_tmp_b_ = (const unsigned char*) (b); \
+    const size_t munit_tmp_size_ = (size); \
+    if (MUNIT_UNLIKELY(memcmp(munit_tmp_a_, munit_tmp_b_, munit_tmp_size_) == 0)) { \
+      munit_errorf("assertion failed: memory %s != %s (%zu bytes)", \
+                   #a, #b, munit_tmp_size_); \
+    } \
+    MUNIT__PUSH_DISABLE_MSVC_C4127 \
+  } while (0) \
+  MUNIT__POP_DISABLE_MSVC_C4127
+
+#define munit_assert_ptr_equal(a, b) \
+  munit_assert_ptr(a, ==, b)
+#define munit_assert_ptr_not_equal(a, b) \
+  munit_assert_ptr(a, !=, b)
+#define munit_assert_null(ptr) \
+  munit_assert_ptr(ptr, ==, NULL)
+#define munit_assert_not_null(ptr) \
+  munit_assert_ptr(ptr, !=, NULL)
+#define munit_assert_ptr_null(ptr) \
+  munit_assert_ptr(ptr, ==, NULL)
+#define munit_assert_ptr_not_null(ptr) \
+  munit_assert_ptr(ptr, !=, NULL)
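+
+/* Illustrative example (not part of the header): on failure these macros
+ * print both operands and abort the test through munit_errorf().  Given a
+ * hypothetical helper open_db() that returns -1,
+ *
+ *   int rv = open_db();
+ *   munit_assert_int(rv, ==, 0);
+ *
+ * reports "assertion failed: rv == 0 (-1 == 0)" and marks the test failed. */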
+
+/*** Memory allocation ***/
+
+void* munit_malloc_ex(const char* filename, int line, size_t size);
+
+#define munit_malloc(size) \
+  munit_malloc_ex(__FILE__, __LINE__, (size))
+
+#define munit_new(type) \
+  ((type*) munit_malloc(sizeof(type)))
+
+#define munit_calloc(nmemb, size) \
+  munit_malloc((nmemb) * (size))
+
+#define munit_newa(type, nmemb) \
+  ((type*) munit_calloc((nmemb), sizeof(type)))
+
+/*** Random number generation ***/
+
+void munit_rand_seed(munit_uint32_t seed);
+munit_uint32_t munit_rand_uint32(void);
+int munit_rand_int_range(int min, int max);
+double munit_rand_double(void);
+void munit_rand_memory(size_t size, munit_uint8_t buffer[MUNIT_ARRAY_PARAM(size)]);
+
+/*** Tests and Suites ***/
+
+typedef enum {
+  /* Test successful */
+  MUNIT_OK,
+  /* Test failed */
+  MUNIT_FAIL,
+  /* Test was skipped */
+  MUNIT_SKIP,
+  /* Test failed due to circumstances not intended to be tested
+   * (things like network errors, invalid parameter value, failure to
+   * allocate memory in the test harness, etc.). */
+  MUNIT_ERROR
+} MunitResult;
+
+typedef struct {
+  char* name;
+  char** values;
+} MunitParameterEnum;
+
+typedef struct {
+  char* name;
+  char* value;
+} MunitParameter;
+
+const char* munit_parameters_get(const MunitParameter params[], const char* key);
+
+typedef enum {
+  MUNIT_TEST_OPTION_NONE = 0,
+  MUNIT_TEST_OPTION_SINGLE_ITERATION = 1 << 0,
+  MUNIT_TEST_OPTION_TODO = 1 << 1
+} MunitTestOptions;
+
+typedef MunitResult (* MunitTestFunc)(const MunitParameter params[], void* user_data_or_fixture);
+typedef void* (* MunitTestSetup)(const MunitParameter params[], void* user_data);
+typedef void (* MunitTestTearDown)(void* fixture);
+
+typedef struct {
+  char* name;
+  MunitTestFunc test;
+  MunitTestSetup setup;
+  MunitTestTearDown tear_down;
+  MunitTestOptions options;
+  MunitParameterEnum* parameters;
+} MunitTest;
+
+typedef enum {
+  MUNIT_SUITE_OPTION_NONE = 0
+} MunitSuiteOptions;
+
+typedef struct MunitSuite_ MunitSuite;
+
+struct MunitSuite_ {
+  char* prefix;
+  MunitTest* tests;
+  MunitSuite* suites;
+  unsigned int iterations;
+  MunitSuiteOptions options;
+};
+
+int munit_suite_main(const MunitSuite* suite, void* user_data, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]);
+
+/* Note: I'm not very happy with this API; it's likely to change if I
+ * figure out something better.  Suggestions welcome. */
+ */
+
+typedef struct MunitArgument_ MunitArgument;
+
+struct MunitArgument_ {
+  char* name;
+  bool (* parse_argument)(const MunitSuite* suite, void* user_data, int* arg, int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)]);
+  void (* write_help)(const MunitArgument* argument, void* user_data);
+};
+
+int munit_suite_main_custom(const MunitSuite* suite,
+                            void* user_data,
+                            int argc, char* const argv[MUNIT_ARRAY_PARAM(argc)],
+                            const MunitArgument arguments[]);
+
+#if defined(MUNIT_ENABLE_ASSERT_ALIASES)
+
+#define assert_true(expr) munit_assert_true(expr)
+#define assert_false(expr) munit_assert_false(expr)
+#define assert_char(a, op, b) munit_assert_char(a, op, b)
+#define assert_uchar(a, op, b) munit_assert_uchar(a, op, b)
+#define assert_short(a, op, b) munit_assert_short(a, op, b)
+#define assert_ushort(a, op, b) munit_assert_ushort(a, op, b)
+#define assert_int(a, op, b) munit_assert_int(a, op, b)
+#define assert_uint(a, op, b) munit_assert_uint(a, op, b)
+#define assert_long(a, op, b) munit_assert_long(a, op, b)
+#define assert_ulong(a, op, b) munit_assert_ulong(a, op, b)
+#define assert_llong(a, op, b) munit_assert_llong(a, op, b)
+#define assert_ullong(a, op, b) munit_assert_ullong(a, op, b)
+#define assert_size(a, op, b) munit_assert_size(a, op, b)
+#define assert_float(a, op, b) munit_assert_float(a, op, b)
+#define assert_double(a, op, b) munit_assert_double(a, op, b)
+#define assert_ptr(a, op, b) munit_assert_ptr(a, op, b)
+
+#define assert_int8(a, op, b) munit_assert_int8(a, op, b)
+#define assert_uint8(a, op, b) munit_assert_uint8(a, op, b)
+#define assert_int16(a, op, b) munit_assert_int16(a, op, b)
+#define assert_uint16(a, op, b) munit_assert_uint16(a, op, b)
+#define assert_int32(a, op, b) munit_assert_int32(a, op, b)
+#define assert_uint32(a, op, b) munit_assert_uint32(a, op, b)
+#define assert_int64(a, op, b) munit_assert_int64(a, op, b)
+#define assert_uint64(a, op, b) munit_assert_uint64(a, op, b)
+
+#define assert_double_equal(a, b, precision) munit_assert_double_equal(a, b, precision)
+#define assert_string_equal(a, b) munit_assert_string_equal(a, b)
+#define assert_string_not_equal(a, b) munit_assert_string_not_equal(a, b)
+#define assert_memory_equal(size, a, b) munit_assert_memory_equal(size, a, b)
+#define assert_memory_not_equal(size, a, b) munit_assert_memory_not_equal(size, a, b)
+#define assert_ptr_equal(a, b) munit_assert_ptr_equal(a, b)
+#define assert_ptr_not_equal(a, b) munit_assert_ptr_not_equal(a, b)
+#define assert_ptr_null(ptr) munit_assert_ptr_null(ptr)
+#define assert_ptr_not_null(ptr) munit_assert_ptr_not_null(ptr)
+
+#define assert_null(ptr) munit_assert_null(ptr)
+#define assert_not_null(ptr) munit_assert_not_null(ptr)
+
+#endif /* defined(MUNIT_ENABLE_ASSERT_ALIASES) */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !defined(MUNIT_H) */
+
+#if defined(MUNIT_ENABLE_ASSERT_ALIASES)
+# if defined(assert)
+#  undef assert
+# endif
+# define assert(expr) munit_assert(expr)
+#endif
diff --git a/test/raft/lib/runner.h b/test/raft/lib/runner.h
new file mode 100644
index 000000000..13244a33a
--- /dev/null
+++ b/test/raft/lib/runner.h
@@ -0,0 +1,113 @@
+/* Convenience macros to reduce munit boilerplate. */
+
+#ifndef TEST_RUNNER_H_
+#define TEST_RUNNER_H_
+
+#include "munit.h"
+
+/* Top-level suites array declaration.
+ *
+ * These top-level suites hold all module-level child suites and must be
+ * defined and then set as child suites of a root suite created at runtime by
+ * the test runner's main(). This can be done using the RUNNER macro.
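+ *
+ * A minimal sketch of the intended usage (RUNNER goes in a main_*.c file,
+ * while SUITE and TEST go in the individual test files; the names here are
+ * borrowed from the byte unit tests further down):
+ *
+ *   RUNNER("core")
+ *
+ *   SUITE(byteCrc32)
+ *
+ *   TEST(byteCrc32, valid, NULL, NULL, 0, NULL)
+ *   {
+ *       return MUNIT_OK;
+ *   }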
+ */
+extern MunitSuite _main_suites[];
+extern int _main_suites_n;
+
+/* Maximum number of test cases for each suite */
+#define SUITE__CAP 128
+
+/* Define the top-level suites array and the main() function of the test. */
+#define RUNNER(NAME)                                                     \
+	MunitSuite _main_suites[SUITE__CAP];                             \
+	int _main_suites_n = 0;                                          \
+                                                                         \
+	int main(int argc, char *argv[MUNIT_ARRAY_PARAM(argc)])          \
+	{                                                                \
+		MunitSuite suite = {(char *)"", NULL, _main_suites, 1, 0}; \
+		return munit_suite_main(&suite, (void *)NAME, argc, argv); \
+	}
+
+/* Declare and register a new test suite #S.
+ *
+ * A test suite is a pair of static variables:
+ *
+ *  static MunitSuite _##S##_suites[SUITE__CAP]
+ *  static MunitTest _##S##_tests[SUITE__CAP]
+ *
+ * The tests and suites attributes of the next available MunitSuite slot in
+ * the _main_suites array will be set to the suite's tests and suites arrays,
+ * and the prefix attribute of the slot will be set to /S. */
+#define SUITE(S)          \
+	SUITE__DECLARE(S) \
+	SUITE__ADD_CHILD(main, #S, S)
+
+/* Declare and register a new test. */
+#define TEST(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS)                     \
+	static MunitResult test_##S##_##C(const MunitParameter params[], \
+					  void *data);                    \
+	TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS)       \
+	static MunitResult test_##S##_##C(                                \
+		MUNIT_UNUSED const MunitParameter params[], MUNIT_UNUSED void *data)
+
+#define SKIP_IF_NO_FIXTURE         \
+	if (f == NULL) {           \
+		return MUNIT_SKIP; \
+	}
+
+/* Declare the MunitSuite[] and the MunitTest[] arrays that compose the test
+ * suite identified by S. */
+#define SUITE__DECLARE(S)                                                \
+	static MunitSuite _##S##_suites[SUITE__CAP];                     \
+	static MunitTest _##S##_tests[SUITE__CAP];                       \
+	static MunitTestSetup _##S##_setup = NULL;                       \
+	static MunitTestTearDown _##S##_tear_down = NULL;                \
+	static int _##S##_suites_n = 0;                                  \
+	static int _##S##_tests_n = 0;                                   \
+	__attribute__((constructor(101))) static void _##S##_init(void)  \
+	{                                                                \
+		memset(_##S##_suites, 0, sizeof(_##S##_suites));         \
+		memset(_##S##_tests, 0, sizeof(_##S##_tests));           \
+		(void)_##S##_suites_n;                                   \
+		(void)_##S##_tests_n;                                    \
+		(void)_##S##_setup;                                      \
+		(void)_##S##_tear_down;                                  \
+	}
+
+/* Set the tests and suites attributes of the next available slot of the
+ * MunitSuite[] array of S1 to the MunitTest[] and MunitSuite[] arrays of S2,
+ * using the given PREFIX. */
+#define SUITE__ADD_CHILD(S1, PREFIX, S2)                                      \
+	__attribute__((constructor(102))) static void _##S1##_##S2##_init(void) \
+	{                                                                     \
+		int n = _##S1##_suites_n;                                     \
+		_##S1##_suites[n].prefix = PREFIX;                            \
+		_##S1##_suites[n].tests = _##S2##_tests;                      \
+		_##S1##_suites[n].suites = _##S2##_suites;                    \
+		_##S1##_suites[n].iterations = 0;                             \
+		_##S1##_suites[n].options = 0;                                \
+		_##S1##_suites_n = n + 1;                                     \
+	}
+
+/* Add a test case to the MunitTest[] array of suite S.
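+ *
+ * For instance, TEST(byteCrc32, valid, ...) in test_byte.c below expands to
+ * a static function test_byteCrc32_valid plus a constructor that registers
+ * it under the name "/valid" in _byteCrc32_tests.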
+ */
+#define TEST__ADD_TO_SUITE(S, C, SETUP, TEAR_DOWN, OPTIONS, PARAMS)       \
+	__attribute__((constructor(103))) static void _##S##_tests_##C##_init( \
+		void)                                                      \
+	{                                                                  \
+		MunitTest *tests = _##S##_tests;                           \
+		int n = _##S##_tests_n;                                    \
+		TEST__SET_IN_ARRAY(tests, n, "/" #C, test_##S##_##C, SETUP, TEAR_DOWN, \
+				   OPTIONS, PARAMS);                       \
+		_##S##_tests_n = n + 1;                                    \
+	}
+
+/* Set the values of the I'th test case slot in the given test array */
+#define TEST__SET_IN_ARRAY(TESTS, I, NAME, FUNC, SETUP, TEAR_DOWN, OPTIONS, \
+			   PARAMS)                                          \
+	TESTS[I].name = NAME;                                               \
+	TESTS[I].test = FUNC;                                               \
+	TESTS[I].setup = SETUP;                                             \
+	TESTS[I].tear_down = TEAR_DOWN;                                     \
+	TESTS[I].options = OPTIONS;                                         \
+	TESTS[I].parameters = PARAMS
+
+#endif /* TEST_RUNNER_H_ */
diff --git a/test/raft/lib/snapshot.h b/test/raft/lib/snapshot.h
new file mode 100644
index 000000000..4a6e8af10
--- /dev/null
+++ b/test/raft/lib/snapshot.h
@@ -0,0 +1,26 @@
+/**
+ * Raft snapshot test helpers.
+ */
+
+#ifndef TEST_SNAPSHOT_H
+#define TEST_SNAPSHOT_H
+
+#include "../../../src/raft.h"
+
+#include "../../../src/raft/configuration.h"
+
+/**
+ * Allocate and create the given snapshot, using the given @LAST_INDEX,
+ * @LAST_TERM, the given @CONF, and generating an FSM snapshot using @X and @Y.
+ */
+#define CREATE_SNAPSHOT(SNAPSHOT, LAST_INDEX, LAST_TERM, CONF, CONF_INDEX, X, \
+			Y)                                                    \
+	SNAPSHOT = raft_malloc(sizeof *SNAPSHOT);                             \
+	munit_assert_ptr_not_null(SNAPSHOT);                                  \
+	SNAPSHOT->index = LAST_INDEX;                                         \
+	SNAPSHOT->term = LAST_TERM;                                           \
+	SNAPSHOT->configuration = CONF;                                       \
+	SNAPSHOT->configuration_index = CONF_INDEX;                           \
+	FsmEncodeSnapshot(X, Y, &SNAPSHOT->bufs, &SNAPSHOT->n_bufs)
+
+#endif /* TEST_SNAPSHOT_H */
diff --git a/test/raft/lib/tcp.c b/test/raft/lib/tcp.c
new file mode 100644
index 000000000..02b305739
--- /dev/null
+++ b/test/raft/lib/tcp.c
@@ -0,0 +1,236 @@
+#include "tcp.h"
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+void TcpServerInit(struct TcpServer *s)
+{
+	struct sockaddr_in addr;
+	socklen_t size = sizeof addr;
+	int rv;
+
+	/* Initialize the socket address structure. */
+	memset(&addr, 0, size);
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+	addr.sin_port = 0; /* Get a random free port */
+
+	/* Create the server socket. */
+	s->socket = socket(AF_INET, SOCK_STREAM, 0);
+	if (s->socket == -1) {
+		munit_errorf("tcp server: socket(): %s", strerror(errno));
+	}
+
+	/* Bind the socket. */
+	rv = bind(s->socket, (struct sockaddr *)&addr, size);
+	if (rv == -1) {
+		munit_errorf("tcp server: bind(): %s", strerror(errno));
+	}
+
+	/* Start listening. */
+	rv = listen(s->socket, 1);
+	if (rv == -1) {
+		munit_errorf("tcp server: listen(): %s", strerror(errno));
+	}
+
+	/* Get the actual address assigned by the kernel and save it back in
+	 * the relevant field.
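+	 * Binding to port 0 above asked the kernel to pick a free ephemeral
+	 * port, and getsockname() is the way to find out which one was
+	 * chosen.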
+	 */
+	rv = getsockname(s->socket, (struct sockaddr *)&addr, &size);
+	if (rv != 0) {
+		munit_errorf("tcp: getsockname(): %s", strerror(errno));
+	}
+
+	s->port = ntohs(addr.sin_port);
+	sprintf(s->address, "127.0.0.1:%d", s->port);
+}
+
+void TcpServerClose(struct TcpServer *s)
+{
+	int rv;
+
+	if (s->socket == -1) {
+		return;
+	}
+
+	rv = close(s->socket);
+	if (rv == -1) {
+		munit_errorf("tcp server: close(): %s", strerror(errno));
+	}
+}
+
+int TcpServerAccept(struct TcpServer *s)
+{
+	int socket;
+	struct sockaddr_in address;
+	socklen_t size;
+
+	size = sizeof(address);
+
+	socket = accept(s->socket, (struct sockaddr *)&address, &size);
+	if (socket < 0) {
+		munit_errorf("tcp server: accept(): %s", strerror(errno));
+	}
+
+	return socket;
+}
+
+void TcpServerStop(struct TcpServer *s)
+{
+	int rv;
+
+	rv = close(s->socket);
+	if (rv == -1) {
+		munit_errorf("tcp server: close(): %s", strerror(errno));
+	}
+	s->socket = -1;
+}
+
+void test_tcp_setup(const MunitParameter params[], struct test_tcp *t)
+{
+	(void)params;
+	t->server.socket = -1;
+	t->client.socket = -1;
+}
+
+void test_tcp_tear_down(struct test_tcp *t)
+{
+	int rv;
+
+	if (t->server.socket != -1) {
+		rv = close(t->server.socket);
+		if (rv == -1) {
+			munit_errorf("tcp: close(): %s", strerror(errno));
+		}
+	}
+
+	if (t->client.socket != -1) {
+		rv = close(t->client.socket);
+		if (rv == -1) {
+			munit_errorf("tcp: close(): %s", strerror(errno));
+		}
+	}
+}
+
+void test_tcp_listen(struct test_tcp *t)
+{
+	struct sockaddr_in addr;
+	socklen_t size = sizeof addr;
+	int rv;
+
+	/* Initialize the socket address structure. */
+	memset(&addr, 0, size);
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+	addr.sin_port = 0; /* Get a random free port */
+
+	/* Create the server socket. */
+	t->server.socket = socket(AF_INET, SOCK_STREAM, 0);
+	if (t->server.socket == -1) {
+		munit_errorf("tcp: socket(): %s", strerror(errno));
+	}
+
+	/* Bind the socket. */
+	rv = bind(t->server.socket, (struct sockaddr *)&addr, size);
+	if (rv == -1) {
+		munit_errorf("tcp: bind(): %s", strerror(errno));
+	}
+
+	/* Start listening. */
+	rv = listen(t->server.socket, 1);
+	if (rv == -1) {
+		munit_errorf("tcp: listen(): %s", strerror(errno));
+	}
+
+	/* Get the actual address assigned by the kernel and save it back in
+	 * the server field of the given test_tcp object. */
+	rv = getsockname(t->server.socket, (struct sockaddr *)&addr, &size);
+	if (rv != 0) {
+		munit_errorf("tcp: getsockname(): %s", strerror(errno));
+	}
+
+	sprintf(t->server.address, "127.0.0.1:%d", ntohs(addr.sin_port));
+}
+
+const char *test_tcp_address(struct test_tcp *t)
+{
+	return t->server.address;
+}
+
+void test_tcp_connect(struct test_tcp *t, int port)
+{
+	struct sockaddr_in addr;
+	int rv;
+
+	/* Create the client socket. */
+	t->client.socket = socket(AF_INET, SOCK_STREAM, 0);
+	if (t->client.socket == -1) {
+		munit_errorf("tcp: socket(): %s", strerror(errno));
+	}
+
+	/* Initialize the socket address structure.
*/ + memset(&addr, 0, sizeof addr); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr.sin_port = htons(port); + + /* Connect */ + rv = connect(t->client.socket, (struct sockaddr *)&addr, sizeof addr); + if (rv == -1) { + munit_errorf("tcp: connect(): %s", strerror(errno)); + } +} + +void test_tcp_close(struct test_tcp *t) +{ + int rv; + + rv = close(t->client.socket); + if (rv == -1) { + munit_errorf("tcp: close(): %s", strerror(errno)); + } + t->client.socket = -1; +} + +void test_tcp_stop(struct test_tcp *t) +{ + int rv; + + rv = close(t->server.socket); + if (rv == -1) { + munit_errorf("tcp: close(): %s", strerror(errno)); + } + t->server.socket = -1; +} + +void test_tcp_send(struct test_tcp *t, const void *buf, int len) +{ + int rv; + + rv = write(t->client.socket, buf, len); + if (rv == -1) { + munit_errorf("tcp: write(): %s", strerror(errno)); + } + if (rv != len) { + munit_errorf("tcp: write(): only %d bytes written", rv); + } +} + +int test_tcp_accept(struct test_tcp *t) +{ + int socket; + struct sockaddr_in address; + socklen_t size; + + size = sizeof(address); + + socket = accept(t->server.socket, (struct sockaddr *)&address, &size); + if (socket < 0) { + munit_errorf("tcp: accept(): %s", strerror(errno)); + } + + return socket; +} diff --git a/test/raft/lib/tcp.h b/test/raft/lib/tcp.h new file mode 100644 index 000000000..c84b2241d --- /dev/null +++ b/test/raft/lib/tcp.h @@ -0,0 +1,110 @@ +/* Test TCP utilities. + * + * This module sports helpers to create server or client sockets, and + * send/receive data through them. + */ + +#ifndef TEST_TCP_H +#define TEST_TCP_H + +#include "munit.h" + +/* Macro helpers. */ +#define FIXTURE_TCP_SERVER struct TcpServer server +#define SETUP_TCP_SERVER TcpServerInit(&f->server) +#define TEAR_DOWN_TCP_SERVER TcpServerClose(&f->server) + +#define TCP_SERVER_STOP TcpServerStop(&f->server) +#define TCP_SERVER_PORT f->server.port +#define TCP_SERVER_ADDRESS f->server.address + +#define FIXTURE_TCP struct test_tcp tcp +#define SETUP_TCP test_tcp_setup(params, &f->tcp) +#define TEAR_DOWN_TCP test_tcp_tear_down(&f->tcp) + +#define TCP_CLIENT_CONNECT(PORT) test_tcp_connect(&f->tcp, PORT) +#define TCP_CLIENT_SEND(BUF, N) test_tcp_send(&f->tcp, BUF, N) +#define TCP_CLIENT_CLOSE test_tcp_close(&f->tcp) + +struct TcpServer +{ + int socket; /* Socket listening to incoming connections */ + int port; + char address[128]; /* IPv4 address of the server, with port */ +}; + +void TcpServerInit(struct TcpServer *s); +void TcpServerClose(struct TcpServer *s); + +/* Accept inbound client connection and return the relevant socket. */ +int TcpServerAccept(struct TcpServer *s); + +/* Close the server socket. */ +void TcpServerStop(struct TcpServer *s); + +struct TcpClient +{ + int socket; /* Socket connected to a server. */ +}; + +void TcpClientInit(struct TcpClient *s); +void TcpClientClose(struct TcpClient *s); + +/* Object that can be used to setup and control a TCP server and/or client. */ +struct test_tcp +{ + struct + { + int socket; /* Socket listening to incoming connections */ + char address[128]; /* IPv4 address of the server, with port */ + } server; + struct + { + int socket; /* Socket connected to another host */ + } client; +}; + +/** + * Bind the server socket of the given test TCP host to localhost and start + * listening to it. + */ +void test_tcp_setup(const MunitParameter params[], struct test_tcp *t); + +void test_tcp_tear_down(struct test_tcp *t); + +/** + * Start listening to a random free port on localhost. 
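+ * The chosen address can then be retrieved with test_tcp_address().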
+ */
+void test_tcp_listen(struct test_tcp *t);
+
+/**
+ * Return the address of the server socket created with @test_tcp_listen.
+ */
+const char *test_tcp_address(struct test_tcp *t);
+
+/**
+ * Connect the client socket to the given port on localhost.
+ */
+void test_tcp_connect(struct test_tcp *t, int port);
+
+/**
+ * Close the client socket.
+ */
+void test_tcp_close(struct test_tcp *t);
+
+/**
+ * Send data using the client socket.
+ */
+void test_tcp_send(struct test_tcp *t, const void *buf, int len);
+
+/**
+ * Accept inbound client connection and return the relevant socket.
+ */
+int test_tcp_accept(struct test_tcp *t);
+
+/**
+ * Close the server socket.
+ */
+void test_tcp_stop(struct test_tcp *t);
+
+#endif /* TEST_TCP_H */
diff --git a/test/raft/lib/uv.h b/test/raft/lib/uv.h
new file mode 100644
index 000000000..7fdcdd08b
--- /dev/null
+++ b/test/raft/lib/uv.h
@@ -0,0 +1,64 @@
+/* Helpers around the libuv-based implementation of the raft_io interface. */
+
+#ifndef TEST_UV_H
+#define TEST_UV_H
+
+#include "../../../src/raft.h"
+#include "dir.h"
+#include "heap.h"
+#include "loop.h"
+
+#define FIXTURE_UV_TRANSPORT struct raft_uv_transport transport
+#define SETUP_UV_TRANSPORT                                       \
+	do {                                                     \
+		int rv_;                                         \
+		f->transport.version = 1;                        \
+		rv_ = raft_uv_tcp_init(&f->transport, &f->loop); \
+		munit_assert_int(rv_, ==, 0);                    \
+	} while (0)
+#define TEAR_DOWN_UV_TRANSPORT raft_uv_tcp_close(&f->transport)
+
+#define FIXTURE_UV_DEPS \
+	FIXTURE_DIR;    \
+	FIXTURE_HEAP;   \
+	FIXTURE_LOOP;   \
+	FIXTURE_UV_TRANSPORT
+#define SETUP_UV_DEPS \
+	SET_UP_DIR;   \
+	SET_UP_HEAP;  \
+	SETUP_LOOP;   \
+	SETUP_UV_TRANSPORT
+#define TEAR_DOWN_UV_DEPS       \
+	TEAR_DOWN_UV_TRANSPORT; \
+	TEAR_DOWN_LOOP;         \
+	TEAR_DOWN_HEAP;         \
+	TEAR_DOWN_DIR
+
+#define FIXTURE_UV struct raft_io io
+
+#define SETUP_UV                                                              \
+	do {                                                                  \
+		int rv_;                                                      \
+		rv_ = raft_uv_init(&f->io, &f->loop, f->dir, &f->transport);  \
+		munit_assert_int(rv_, ==, 0);                                 \
+		raft_uv_set_auto_recovery(&f->io, false);                     \
+		rv_ = f->io.init(&f->io, 1, "127.0.0.1:9001");                \
+		munit_assert_int(rv_, ==, 0);                                 \
+	} while (0)
+
+MUNIT_UNUSED static void uvCloseCb(struct raft_io *io)
+{
+	bool *closed = io->data;
+	*closed = true;
+}
+
+#define TEAR_DOWN_UV                            \
+	do {                                    \
+		bool _closed = false;           \
+		f->io.data = &_closed;          \
+		f->io.close(&f->io, uvCloseCb); \
+		LOOP_RUN_UNTIL(&_closed);       \
+		raft_uv_close(&f->io);          \
+	} while (0)
+
+#endif /* TEST_UV_H */
diff --git a/test/raft/unit/main_core.c b/test/raft/unit/main_core.c
new file mode 100644
index 000000000..ad1798bba
--- /dev/null
+++ b/test/raft/unit/main_core.c
@@ -0,0 +1,3 @@
+#include "../lib/runner.h"
+
+RUNNER("core")
diff --git a/test/raft/unit/main_uv.c b/test/raft/unit/main_uv.c
new file mode 100644
index 000000000..7f2eba543
--- /dev/null
+++ b/test/raft/unit/main_uv.c
@@ -0,0 +1,3 @@
+#include "../lib/runner.h"
+
+RUNNER("uv")
diff --git a/test/raft/unit/test_byte.c b/test/raft/unit/test_byte.c
new file mode 100644
index 000000000..2ad2dd485
--- /dev/null
+++ b/test/raft/unit/test_byte.c
@@ -0,0 +1,179 @@
+#include <ctype.h>
+#include <stdio.h>
+
+#include "../../../src/raft/byte.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * Helper macros
+ *
+ *****************************************************************************/
+
+#define CRC32(VALUE) byteCrc32(&(VALUE), sizeof VALUE, 0)
+
+/******************************************************************************
+ *
+ * byteCrc32
+ *
+ *****************************************************************************/
+
+SUITE(byteCrc32) + +/* The same data produces the same sum. */ +TEST(byteCrc32, valid, NULL, NULL, 0, NULL) +{ + uint64_t value1 = 123456789; + uint64_t value2 = 123456789; + munit_assert_int(CRC32(value1), ==, CRC32(value2)); + return MUNIT_OK; +} + +/* Different data produces a different sum. */ +TEST(byteCrc32, invalid, NULL, NULL, 0, NULL) +{ + uint64_t value1 = 123456789; + uint64_t value2 = 123466789; + munit_assert_int(CRC32(value1), !=, CRC32(value2)); + return MUNIT_OK; +} + +/****************************************************************************** + * + * Convert to little endian representation (least significant byte first). + * + *****************************************************************************/ + +SUITE(byteFlip) + +/* Convert a 32-bit number. */ +TEST(byteFlip, 32, NULL, NULL, 0, NULL) +{ + uint32_t value; + unsigned i; + value = byteFlip32(0x03020100); + for (i = 0; i < 4; i++) { + munit_assert_int(*((uint8_t *)&value + i), ==, i); + } + return MUNIT_OK; +} + +/* Convert a 64-bit number. */ +TEST(byteFlip, 64, NULL, NULL, 0, NULL) +{ + uint64_t value; + unsigned i; + value = byteFlip64(0x0706050403020100); + for (i = 0; i < 8; i++) { + munit_assert_int(*((uint8_t *)&value + i), ==, i); + } + return MUNIT_OK; +} + +/****************************************************************************** + * + * byteGetString + * + *****************************************************************************/ + +SUITE(byteGetString) + +TEST(byteGetString, success, NULL, NULL, 0, NULL) +{ + uint8_t buf[] = {'h', 'e', 'l', 'l', 'o', 0}; + const void *cursor = buf; + munit_assert_string_equal(byteGetString(&cursor, sizeof buf), "hello"); + munit_assert_ptr_equal(cursor, buf + sizeof buf); + return MUNIT_OK; +} + +TEST(byteGetString, malformed, NULL, NULL, 0, NULL) +{ + uint8_t buf[] = {'h', 'e', 'l', 'l', 'o', 'w'}; + const void *cursor = buf; + munit_assert_ptr_equal(byteGetString(&cursor, sizeof buf), NULL); + munit_assert_ptr_equal(cursor, buf); + return MUNIT_OK; +} + +/****************************************************************************** + * + * byteGet64 + * + *****************************************************************************/ + +SUITE(byteGet64) + +TEST(byteGet64, success, NULL, NULL, 0, NULL) +{ + uint8_t *buf = munit_malloc(sizeof(uint64_t) * 2); + void *cursor1 = buf + 1; + const void *cursor2 = buf + 1; + bytePut64(&cursor1, 1); + munit_assert_int(byteGet64(&cursor2), ==, 1); + free(buf); + return MUNIT_OK; +} + +/****************************************************************************** + * + * byteSha1 + * + *****************************************************************************/ + +/* Assert that the 20 bytes contained in VALUE match the given DIGEST + * hexadecimal representation. 
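+ * For instance, a leading digest byte 0xa9 is printed as "a9" by the
+ * sprintf() call and then upcased to "A9" before the comparison.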
+ */
+#define ASSERT_SHA1(VALUE, DIGEST)                                  \
+	do {                                                        \
+		char _digest[41];                                   \
+		unsigned _i;                                        \
+		for (_i = 0; _i < 20; _i++) {                       \
+			unsigned _j = _i * 2;                       \
+			sprintf(&_digest[_j], "%.2x", VALUE[_i]);   \
+			_digest[_j] = toupper(_digest[_j]);         \
+			_digest[_j + 1] = toupper(_digest[_j + 1]); \
+		}                                                   \
+		_digest[40] = '\0';                                 \
+		munit_assert_string_equal(_digest, DIGEST);         \
+	} while (0)
+
+SUITE(byteSha1)
+
+TEST(byteSha1, abc, NULL, NULL, 0, NULL)
+{
+	struct byteSha1 sha1;
+	uint8_t text[] = "abc";
+	uint8_t value[20];
+	byteSha1Init(&sha1);
+	byteSha1Update(&sha1, text, sizeof text - 1);
+	byteSha1Digest(&sha1, value);
+	ASSERT_SHA1(value, "A9993E364706816ABA3E25717850C26C9CD0D89D");
+	return MUNIT_OK;
+}
+
+TEST(byteSha1, abcWithZeroLen, NULL, NULL, 0, NULL)
+{
+	struct byteSha1 sha1;
+	uint8_t text[] = "abc";
+	uint8_t garbage[] = "garbage";
+	uint8_t value[20];
+	byteSha1Init(&sha1);
+	byteSha1Update(&sha1, text, sizeof text - 1);
+	/* Update with 0 length buffer doesn't change digest */
+	byteSha1Update(&sha1, garbage, 0);
+	byteSha1Digest(&sha1, value);
+	ASSERT_SHA1(value, "A9993E364706816ABA3E25717850C26C9CD0D89D");
+	return MUNIT_OK;
+}
+
+TEST(byteSha1, abcbd, NULL, NULL, 0, NULL)
+{
+	struct byteSha1 sha1;
+	uint8_t text[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+	uint8_t value[20];
+	byteSha1Init(&sha1);
+	byteSha1Update(&sha1, text, sizeof text - 1);
+	byteSha1Digest(&sha1, value);
+	ASSERT_SHA1(value, "84983E441C3BD26EBAAE4AA1F95129E5E54670F1");
+	return MUNIT_OK;
+}
diff --git a/test/raft/unit/test_compress.c b/test/raft/unit/test_compress.c
new file mode 100644
index 000000000..0b5a2126c
--- /dev/null
+++ b/test/raft/unit/test_compress.c
@@ -0,0 +1,319 @@
+#include "../../../src/raft/byte.h"
+#include "../../../src/raft/compress.h"
+#include "../lib/munit.h"
+#include "../lib/runner.h"
+
+#include <stdlib.h>
+#ifdef LZ4_AVAILABLE
+#include <lz4frame.h>
+#endif
+
+SUITE(Compress)
+
+struct raft_buffer getBufWithRandom(size_t len)
+{
+	struct raft_buffer buf = {0};
+	buf.len = len;
+	buf.base = munit_malloc(buf.len);
+	if (len != 0) {
+		munit_assert_ptr_not_null(buf.base);
+	}
+
+	size_t offset = 0;
+	/* Write as many random ints in buf as possible */
+	for (size_t n = buf.len / sizeof(int); n > 0; n--) {
+		*((int *)(buf.base) + offset) = rand();
+		offset += 1;
+	}
+
+	/* Fill the remaining bytes */
+	size_t rem = buf.len % sizeof(int);
+	/* Offset will now be used in char* arithmetic */
+	offset *= sizeof(int);
+	if (rem) {
+		int r_int = rand();
+		for (unsigned i = 0; i < rem; i++) {
+			*((char *)buf.base + offset) = *((char *)&r_int + i);
+			offset++;
+		}
+	}
+
+	munit_assert_ulong(offset, ==, buf.len);
+	return buf;
+}
+
+struct raft_buffer getBufWithNonRandom(size_t len)
+{
+	struct raft_buffer buf = {0};
+	buf.len = len;
+	buf.base = munit_malloc(buf.len);
+	if (len != 0) {
+		munit_assert_ptr_not_null(buf.base);
+	}
+
+	memset(buf.base, 0xAC, buf.len);
+	return buf;
+}
+
+#ifdef LZ4_AVAILABLE
+
+static void sha1(struct raft_buffer bufs[], unsigned n_bufs, uint8_t value[20])
+{
+	struct byteSha1 sha;
+	byteSha1Init(&sha);
+	for (unsigned i = 0; i < n_bufs; i++) {
+		byteSha1Update(&sha, (const uint8_t *)bufs[i].base,
+			       (uint32_t)bufs[i].len);
+	}
+	byteSha1Digest(&sha, value);
+}
+
+TEST(Compress, compressDecompressZeroLength, NULL, NULL, 0, NULL)
+{
+	char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0};
+	struct raft_buffer bufs1[2] = {{NULL, 0},
+				       {(void *)0xDEADBEEF, 0}}; /* 0 length */
+	struct raft_buffer bufs2[2] = {{(void *)0xDEADBEEF, 0},
+				       {NULL, 0}}; /* 0 length */
+	struct raft_buffer
compressed = {0}; + munit_assert_int(Compress(&bufs1[0], 1, &compressed, errmsg), ==, + RAFT_INVALID); + munit_assert_int(Compress(&bufs1[1], 1, &compressed, errmsg), ==, + RAFT_INVALID); + munit_assert_int(Compress(bufs1, 2, &compressed, errmsg), ==, RAFT_INVALID); + munit_assert_int(Compress(bufs2, 2, &compressed, errmsg), ==, RAFT_INVALID); + return MUNIT_OK; +} + +static char *len_one_params[] = { + /* 16B 1KB 64KB 4MB 128MB */ + "16", "1024", "65536", "4194304", "134217728", + /* Around Blocksize*/ + "65516", "65517", "65518", "65521", "65535", "65537", "65551", "65555", + "65556", + /* Ugly lengths */ + "0", "1", "9", "123450", "1337", "6655111", NULL}; + +static MunitParameterEnum random_one_params[] = { + {"len_one", len_one_params}, + {NULL, NULL}, +}; + +TEST(Compress, compressDecompressRandomOne, NULL, NULL, 0, random_one_params) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + struct raft_buffer decompressed = {0}; + uint8_t sha1_virgin[20] = {0}; + uint8_t sha1_decompressed[20] = {1}; + + /* Fill a buffer with random data */ + size_t len = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); + if (len == 0) { + return MUNIT_SKIP; + } + struct raft_buffer buf = getBufWithRandom(len); + + /* Assert that after compression and decompression the data is unchanged */ + sha1(&buf, 1, sha1_virgin); + munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); + free(buf.base); + munit_assert_true(IsCompressed(compressed.base, compressed.len)); + munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); + munit_assert_ulong(decompressed.len, ==, len); + sha1(&decompressed, 1, sha1_decompressed); + munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); + + raft_free(compressed.base); + raft_free(decompressed.base); + return MUNIT_OK; +} + +static char *len_nonrandom_one_params[] = { +#if !defined(__LP64__) && \ + (defined(__arm__) || defined(__i386__) || defined(__mips__)) + /* 4KB 64KB 4MB 1GB INT_MAX (larger allocations + fail on 32-bit archs */ + "4096", "65536", "4194304", "1073741824", "2147483647", +#else + /* 4KB 64KB 4MB 1GB 2GB + 200MB */ + "4096", "65536", "4194304", "1073741824", "2357198848", +#endif + /* Around Blocksize*/ + "65516", "65517", "65518", "65521", "65535", "65537", "65551", "65555", + "65556", + /* Ugly lengths */ + "0", "993450", "31337", "83883825", NULL}; + +static MunitParameterEnum nonrandom_one_params[] = { + {"len_one", len_nonrandom_one_params}, + {NULL, NULL}, +}; + +TEST(Compress, + compressDecompressNonRandomOne, + NULL, + NULL, + 0, + nonrandom_one_params) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + struct raft_buffer decompressed = {0}; + uint8_t sha1_virgin[20] = {0}; + uint8_t sha1_decompressed[20] = {1}; + + /* Fill a buffer with non-random data */ + size_t len = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); + if (len == 0) { + return MUNIT_SKIP; + } + struct raft_buffer buf = getBufWithNonRandom(len); + + /* Assert that after compression and decompression the data is unchanged and + * that the compressed data is actually smaller */ + sha1(&buf, 1, sha1_virgin); + munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); + free(buf.base); + munit_assert_true(IsCompressed(compressed.base, compressed.len)); + if (len > 0) { + munit_assert_ulong(compressed.len, <, buf.len); + } + munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); + munit_assert_ulong(decompressed.len, ==, len); + 
sha1(&decompressed, 1, sha1_decompressed); + munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); + + raft_free(compressed.base); + raft_free(decompressed.base); + return MUNIT_OK; +} + +static char *len_two_params[] = {"4194304", "13373", "66", "0", NULL}; + +static MunitParameterEnum random_two_params[] = { + {"len_one", len_one_params}, + {"len_two", len_two_params}, + {NULL, NULL}, +}; + +TEST(Compress, compressDecompressRandomTwo, NULL, NULL, 0, random_two_params) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + struct raft_buffer decompressed = {0}; + uint8_t sha1_virgin[20] = {0}; + uint8_t sha1_single[20] = {0}; + uint8_t sha1_decompressed[20] = {1}; + + /* Fill two buffers with random data */ + size_t len1 = strtoul(munit_parameters_get(params, "len_one"), NULL, 0); + size_t len2 = strtoul(munit_parameters_get(params, "len_two"), NULL, 0); + if (len1 + len2 == 0) { + return MUNIT_SKIP; + } + struct raft_buffer buf1 = getBufWithRandom(len1); + struct raft_buffer buf2 = getBufWithRandom(len2); + struct raft_buffer bufs[2] = {buf1, buf2}; + + /* If one of the buffers is empty ensure data is identical to single buffer + * case. */ + if (len1 == 0) { + sha1(&buf2, 1, sha1_single); + } else if (len2 == 0) { + sha1(&buf1, 1, sha1_single); + } + + /* Assert that after compression and decompression the data is unchanged */ + sha1(bufs, 2, sha1_virgin); + munit_assert_int(Compress(bufs, 2, &compressed, errmsg), ==, 0); + free(buf1.base); + free(buf2.base); + munit_assert_true(IsCompressed(compressed.base, compressed.len)); + munit_assert_int(Decompress(compressed, &decompressed, errmsg), ==, 0); + munit_assert_ulong(decompressed.len, ==, buf1.len + buf2.len); + sha1(&decompressed, 1, sha1_decompressed); + munit_assert_int(memcmp(sha1_virgin, sha1_decompressed, 20), ==, 0); + + if (len1 == 0 || len2 == 0) { + munit_assert_int(memcmp(sha1_single, sha1_virgin, 20), ==, 0); + munit_assert_int(memcmp(sha1_single, sha1_decompressed, 20), ==, 0); + } + + raft_free(compressed.base); + raft_free(decompressed.base); + return MUNIT_OK; +} + +TEST(Compress, compressDecompressCorruption, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + struct raft_buffer decompressed = {0}; + + /* Fill a buffer with random data */ + size_t len = 2048; + struct raft_buffer buf = getBufWithRandom(len); + + munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, 0); + munit_assert_true(IsCompressed(compressed.base, compressed.len)); + + /* Corrupt the a data byte after the header */ + munit_assert_ulong(LZ4F_HEADER_SIZE_MAX_RAFT, <, compressed.len); + ((char *)compressed.base)[LZ4F_HEADER_SIZE_MAX_RAFT] += 1; + + munit_assert_int(Decompress(compressed, &decompressed, errmsg), !=, 0); + munit_assert_string_equal(errmsg, + "LZ4F_decompress ERROR_contentChecksum_invalid"); + munit_assert_ptr_null(decompressed.base); + + raft_free(compressed.base); + free(buf.base); + return MUNIT_OK; +} + +#else + +TEST(Compress, lz4Disabled, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE] = {0}; + struct raft_buffer compressed = {0}; + + /* Fill a buffer with random data */ + size_t len = 2048; + struct raft_buffer buf = getBufWithRandom(len); + + munit_assert_int(Compress(&buf, 1, &compressed, errmsg), ==, RAFT_INVALID); + munit_assert_ptr_null(compressed.base); + + free(buf.base); + return MUNIT_OK; +} + +#endif /* LZ4_AVAILABLE */ + +static const char LZ4_MAGIC[4] = {0x04, 0x22, 0x4d, 0x18}; +TEST(Compress, 
isCompressedTooSmall, NULL, NULL, 0, NULL) +{ + munit_assert_false(IsCompressed(&LZ4_MAGIC[1], sizeof(LZ4_MAGIC) - 1)); + return MUNIT_OK; +} + +TEST(Compress, isCompressedNull, NULL, NULL, 0, NULL) +{ + munit_assert_false(IsCompressed(NULL, sizeof(LZ4_MAGIC))); + return MUNIT_OK; +} + +TEST(Compress, isCompressed, NULL, NULL, 0, NULL) +{ + munit_assert_true(IsCompressed(LZ4_MAGIC, sizeof(LZ4_MAGIC))); + return MUNIT_OK; +} + +TEST(Compress, notCompressed, NULL, NULL, 0, NULL) +{ + char not_compressed[4] = {0x18, 0x4d, 0x22, 0x04}; + munit_assert_false(IsCompressed(not_compressed, sizeof(not_compressed))); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_configuration.c b/test/raft/unit/test_configuration.c new file mode 100644 index 000000000..91f6d9792 --- /dev/null +++ b/test/raft/unit/test_configuration.c @@ -0,0 +1,638 @@ +#include "../../../src/raft/byte.h" +#include "../../../src/raft/configuration.h" +#include "../lib/heap.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + struct raft_configuration configuration; +}; + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SET_UP_HEAP; + configurationInit(&f->configuration); + return f; +} + +static void tearDownNoClose(void *data) +{ + struct fixture *f = data; + TEAR_DOWN_HEAP; + free(f); +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + configurationClose(&f->configuration); + tearDownNoClose(data); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Accessors */ +#define VOTER_COUNT configurationVoterCount(&f->configuration) +#define INDEX_OF(ID) configurationIndexOf(&f->configuration, ID) +#define INDEX_OF_VOTER(ID) configurationIndexOfVoter(&f->configuration, ID) +#define GET(ID) configurationGet(&f->configuration, ID) + +/* Add a server to the fixture's configuration. */ +#define ADD_RV(ID, ADDRESS, ROLE) \ + configurationAdd(&f->configuration, ID, ADDRESS, ROLE) +#define ADD(...) munit_assert_int(ADD_RV(__VA_ARGS__), ==, 0) +#define ADD_ERROR(RV, ...) munit_assert_int(ADD_RV(__VA_ARGS__), ==, RV) + +/* Remove a server from the fixture's configuration */ +#define REMOVE_RV(ID) configurationRemove(&f->configuration, ID) +#define REMOVE(...) munit_assert_int(REMOVE_RV(__VA_ARGS__), ==, 0) +#define REMOVE_ERROR(RV, ...) munit_assert_int(REMOVE_RV(__VA_ARGS__), ==, RV) + +/* Copy the fixture's configuration into the given one. */ +#define COPY_RV(CONF) configurationCopy(&f->configuration, CONF) +#define COPY(...) munit_assert_int(COPY_RV(__VA_ARGS__), ==, 0) +#define COPY_ERROR(RV, ...) munit_assert_int(COPY_RV(__VA_ARGS__), ==, RV) + +/* Encode the fixture's configuration into the given buffer. */ +#define ENCODE_RV(BUF) configurationEncode(&f->configuration, BUF) +#define ENCODE(...) munit_assert_int(ENCODE_RV(__VA_ARGS__), ==, 0) +#define ENCODE_ERROR(RV, ...) munit_assert_int(ENCODE_RV(__VA_ARGS__), ==, RV) + +/* Decode the given buffer into the fixture's configuration. */ +#define DECODE_RV(BUF) configurationDecode(BUF, &f->configuration) +#define DECODE(...) munit_assert_int(DECODE_RV(__VA_ARGS__), ==, 0) +#define DECODE_ERROR(RV, ...) 
munit_assert_int(DECODE_RV(__VA_ARGS__), ==, RV) + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert that the fixture's configuration has n servers. */ +#define ASSERT_N(N) \ + { \ + munit_assert_int(f->configuration.n, ==, N); \ + if (N == 0) { \ + munit_assert_ptr_null(f->configuration.servers); \ + } else { \ + munit_assert_ptr_not_null(f->configuration.servers); \ + } \ + } + +/* Assert that the attributes of the I'th server in the fixture's configuration + * match the given values. */ +#define ASSERT_SERVER(I, ID, ADDRESS, ROLE) \ + { \ + struct raft_server *server; \ + munit_assert_int(I, <, f->configuration.n); \ + server = &f->configuration.servers[I]; \ + munit_assert_int(server->id, ==, ID); \ + munit_assert_string_equal(server->address, ADDRESS); \ + munit_assert_int(server->role, ==, ROLE); \ + } + +/****************************************************************************** + * + * configurationVoterCount + * + *****************************************************************************/ + +SUITE(configurationVoterCount) + +/* All servers are voting. */ +TEST(configurationVoterCount, all_voters, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + ADD(2, "192.168.1.2:666", RAFT_VOTER); + munit_assert_int(VOTER_COUNT, ==, 2); + return MUNIT_OK; +} + +/* Return only voting servers. */ +TEST(configurationVoterCount, filter, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + ADD(2, "192.168.1.2:666", RAFT_STANDBY); + munit_assert_int(VOTER_COUNT, ==, 1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationIndexOf + * + *****************************************************************************/ + +SUITE(configurationIndexOf) + +/* If a matching server is found, it's index is returned. */ +TEST(configurationIndexOf, match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + ADD(2, "192.168.1.2:666", RAFT_STANDBY); + munit_assert_int(INDEX_OF(2), ==, 1); + return MUNIT_OK; +} + +/* If no matching server is found, the length of the configuration is + * returned. */ +TEST(configurationIndexOf, no_match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + munit_assert_int(INDEX_OF(3), ==, f->configuration.n); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationIndexOfVoter + * + *****************************************************************************/ + +SUITE(configurationIndexOfVoter) + +/* The index of the matching voting server (relative to the number of voting + servers) is returned. */ +TEST(configurationIndexOfVoter, match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_STANDBY); + ADD(2, "192.168.1.2:666", RAFT_VOTER); + ADD(3, "192.168.1.3:666", RAFT_VOTER); + munit_assert_int(INDEX_OF_VOTER(3), ==, 1); + return MUNIT_OK; +} + +/* If no matching server is found, the length of the configuration is + * returned. 
*/ +TEST(configurationIndexOfVoter, no_match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + munit_assert_int(INDEX_OF_VOTER(3), ==, 1); + return MUNIT_OK; +} + +/* If the server exists but is non-voting, the length of the configuration is + * returned. */ +TEST(configurationIndexOfVoter, non_voting, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "192.168.1.1:666", RAFT_STANDBY); + munit_assert_int(INDEX_OF_VOTER(1), ==, 1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationGet + * + *****************************************************************************/ + +SUITE(configurationGet) + +/* If a matching server is found, it's returned. */ +TEST(configurationGet, match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + const struct raft_server *server; + ADD(1, "192.168.1.1:666", RAFT_VOTER); + ADD(2, "192.168.1.2:666", RAFT_STANDBY); + server = GET(2); + munit_assert_ptr_not_null(server); + munit_assert_int(server->id, ==, 2); + munit_assert_string_equal(server->address, "192.168.1.2:666"); + return MUNIT_OK; +} + +/* If no matching server is found, NULL is returned. */ +TEST(configurationGet, no_match, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + munit_assert_ptr_null(GET(3)); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationCopy + * + *****************************************************************************/ + +SUITE(configurationCopy) + +/* Copy a configuration containing two servers */ +TEST(configurationCopy, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_configuration configuration; + ADD(1, "192.168.1.1:666", RAFT_STANDBY); + ADD(2, "192.168.1.2:666", RAFT_VOTER); + COPY(&configuration); + munit_assert_int(configuration.n, ==, 2); + munit_assert_int(configuration.servers[0].id, ==, 1); + munit_assert_int(configuration.servers[1].id, ==, 2); + configurationClose(&configuration); + return MUNIT_OK; +} + +static char *copy_oom_heap_fault_delay[] = {"0", "1", "2", NULL}; +static char *copy_oom_heap_fault_repeat[] = {"1", NULL}; + +static MunitParameterEnum copy_oom_params[] = { + {TEST_HEAP_FAULT_DELAY, copy_oom_heap_fault_delay}, + {TEST_HEAP_FAULT_REPEAT, copy_oom_heap_fault_repeat}, + {NULL, NULL}, +}; + +/* Out of memory */ +TEST(configurationCopy, oom, setUp, tearDown, 0, copy_oom_params) +{ + struct fixture *f = data; + struct raft_configuration configuration; + ADD(1, "192.168.1.1:666", RAFT_STANDBY); + ADD(2, "192.168.1.2:666", RAFT_VOTER); + HEAP_FAULT_ENABLE; + COPY_ERROR(RAFT_NOMEM, &configuration); + return MUNIT_OK; +} + +/****************************************************************************** + * + * raft_configuration_add + * + *****************************************************************************/ + +SUITE(configurationAdd) + +/* Add a server to the configuration. */ +TEST(configurationAdd, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ASSERT_N(1); + ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); + return MUNIT_OK; +} + +/* Add two servers to the configuration. 
*/ +TEST(configurationAdd, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD(2, "192.168.1.1:666", RAFT_STANDBY); + ASSERT_N(2); + ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); + ASSERT_SERVER(1, 2, "192.168.1.1:666", RAFT_STANDBY); + return MUNIT_OK; +} + +/* Add a server with an ID which is already in use. */ +TEST(configurationAdd, duplicateId, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD_ERROR(RAFT_DUPLICATEID, 1, "192.168.1.1:666", RAFT_STANDBY); + return MUNIT_OK; +} + +/* Add a server with an address which is already in use. */ +TEST(configurationAdd, duplicateAddress, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD_ERROR(RAFT_DUPLICATEADDRESS, 2, "127.0.0.1:666", RAFT_STANDBY); + return MUNIT_OK; +} + +/* Add a server with an invalid role. */ +TEST(configurationAdd, invalidRole, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD_ERROR(RAFT_BADROLE, 2, "127.0.0.1:666", 666); + return MUNIT_OK; +} + +static char *add_oom_heap_fault_delay[] = {"0", "1", NULL}; +static char *add_oom_heap_fault_repeat[] = {"1", NULL}; + +static MunitParameterEnum add_oom_params[] = { + {TEST_HEAP_FAULT_DELAY, add_oom_heap_fault_delay}, + {TEST_HEAP_FAULT_REPEAT, add_oom_heap_fault_repeat}, + {NULL, NULL}, +}; + +/* Out of memory. */ +TEST(configurationAdd, oom, setUp, tearDown, 0, add_oom_params) +{ + struct fixture *f = data; + HeapFaultEnable(&f->heap); + ADD_ERROR(RAFT_NOMEM, 1, "127.0.0.1:666", RAFT_VOTER); + munit_assert_null(f->configuration.servers); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationRemove + * + *****************************************************************************/ + +SUITE(configurationRemove) + +/* Remove the last and only server. */ +TEST(configurationRemove, last, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + REMOVE(1); + ASSERT_N(0); + return MUNIT_OK; +} + +/* Remove the first server. */ +TEST(configurationRemove, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD(2, "192.168.1.1:666", RAFT_STANDBY); + REMOVE(1); + ASSERT_N(1); + ASSERT_SERVER(0, 2, "192.168.1.1:666", RAFT_STANDBY); + return MUNIT_OK; +} + +/* Remove a server in the middle. */ +TEST(configurationRemove, middle, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD(2, "192.168.1.1:666", RAFT_STANDBY); + ADD(3, "10.0.1.1:666", RAFT_VOTER); + REMOVE(2); + ASSERT_N(2); + ASSERT_SERVER(0, 1, "127.0.0.1:666", RAFT_VOTER); + ASSERT_SERVER(1, 3, "10.0.1.1:666", RAFT_VOTER); + return MUNIT_OK; +} + +/* Attempts to remove a server with an unknown ID result in an error. */ +TEST(configurationRemove, unknownId, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + REMOVE_ERROR(RAFT_BADID, 1); + return MUNIT_OK; +} + +/* Out of memory. 
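+ *
+ * HeapFaultConfig(&f->heap, 0, 1) below arms the test allocator so that the
+ * very first allocation fails once, which configurationRemove is expected
+ * to surface as RAFT_NOMEM.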
*/ +TEST(configurationRemove, oom, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ADD(2, "192.168.1.1:666", RAFT_STANDBY); + HeapFaultConfig(&f->heap, 0, 1); + HeapFaultEnable(&f->heap); + REMOVE_ERROR(RAFT_NOMEM, 1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationEncode + * + *****************************************************************************/ + +SUITE(configurationEncode) + +/* Encode a configuration with one server. */ +TEST(configurationEncode, one_server, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_buffer buf; + size_t len; + const void *cursor; + const char *address = "127.0.0.1:666"; + ADD(1, address, RAFT_VOTER); + ENCODE(&buf); + + len = 1 + 8 + /* Version and n of servers */ + 8 + strlen(address) + 1; /* Server */ + len = bytePad64(len); + + munit_assert_int(buf.len, ==, len); + + cursor = buf.base; + + munit_assert_int(byteGet8(&cursor), ==, 1); + munit_assert_int(byteGet64(&cursor), ==, 1); + + munit_assert_int(byteGet64(&cursor), ==, 1); + munit_assert_string_equal(byteGetString(&cursor, strlen(address) + 1), + address); + munit_assert_int(byteGet8(&cursor), ==, RAFT_VOTER); + + raft_free(buf.base); + + return MUNIT_OK; +} + +/* Encode a configuration with two servers. */ +TEST(configurationEncode, two_servers, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_buffer buf; + size_t len; + const void *cursor; + const char *address1 = "127.0.0.1:666"; + const char *address2 = "192.168.1.1:666"; + + ADD(1, address1, RAFT_STANDBY); + ADD(2, address2, RAFT_VOTER); + ENCODE(&buf); + + len = 1 + 8 + /* Version and n of servers */ + 8 + strlen(address1) + 1 + 1 + /* Server 1 */ + 8 + strlen(address2) + 1 + 1; /* Server 2 */ + len = bytePad64(len); + + munit_assert_int(buf.len, ==, len); + + cursor = buf.base; + + munit_assert_int(byteGet8(&cursor), ==, 1); + munit_assert_int(byteGet64(&cursor), ==, 2); + + munit_assert_int(byteGet64(&cursor), ==, 1); + munit_assert_string_equal(byteGetString(&cursor, strlen(address1) + 1), + address1); + munit_assert_int(byteGet8(&cursor), ==, RAFT_STANDBY); + + munit_assert_int(byteGet64(&cursor), ==, 2); + munit_assert_string_equal(byteGetString(&cursor, strlen(address2) + 1), + address2); + munit_assert_int(byteGet8(&cursor), ==, RAFT_VOTER); + + raft_free(buf.base); + + return MUNIT_OK; +} + +/* Out of memory. 
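+ *
+ * The fault is armed with delay 2, so the allocations performed by ADD
+ * presumably succeed and the buffer allocation done by configurationEncode
+ * is the one that fails with RAFT_NOMEM.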
*/ +TEST(configurationEncode, oom, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_buffer buf; + HeapFaultConfig(&f->heap, 2, 1); + HeapFaultEnable(&f->heap); + ADD(1, "127.0.0.1:666", RAFT_VOTER); + ENCODE_ERROR(RAFT_NOMEM, &buf); + return MUNIT_OK; +} + +/****************************************************************************** + * + * configurationDecode + * + *****************************************************************************/ + +SUITE(configurationDecode) + +/* The decode a payload encoding a configuration with one server */ +TEST(configurationDecode, one_server, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 1, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', 0, /* Server address */ + 1}; /* Role code */ + struct raft_buffer buf; + + buf.base = bytes; + buf.len = sizeof bytes; + + DECODE(&buf); + + ASSERT_N(1); + ASSERT_SERVER(0, 5, "x.y", RAFT_VOTER); + + return MUNIT_OK; +} + +/* The decode size is the size of a raft_server array plus the length of the + * addresses. */ +TEST(configurationDecode, two_servers, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', 0, /* Server address */ + 1, /* Role code */ + 3, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + '1', '9', '2', '.', '2', 0, /* Server address */ + 0}; /* Role code */ + struct raft_buffer buf; + buf.base = bytes; + buf.len = sizeof bytes; + DECODE(&buf); + ASSERT_N(2); + ASSERT_SERVER(0, 5, "x.y", RAFT_VOTER); + ASSERT_SERVER(1, 3, "192.2", RAFT_STANDBY); + return MUNIT_OK; +} + +static char *decode_oom_heap_fault_delay[] = {"0", "1", "2", "3", NULL}; +static char *decode_oom_heap_fault_repeat[] = {"1", NULL}; + +static MunitParameterEnum decode_oom_params[] = { + {TEST_HEAP_FAULT_DELAY, decode_oom_heap_fault_delay}, + {TEST_HEAP_FAULT_REPEAT, decode_oom_heap_fault_repeat}, + {NULL, NULL}, +}; + +/* Not enough memory for creating the decoded configuration object. */ +TEST(configurationDecode, oom, setUp, tearDownNoClose, 0, decode_oom_params) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', 0, /* Server address */ + 1, /* Role code */ + 3, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'z', '.', 'w', 0, /* Server address */ + 0}; /* Role code */ + struct raft_buffer buf; + HEAP_FAULT_ENABLE; + buf.base = bytes; + buf.len = sizeof bytes; + DECODE_ERROR(RAFT_NOMEM, &buf); + return MUNIT_OK; +} + +/* If the encoding version is wrong, an error is returned. */ +TEST(configurationDecode, badVersion, setUp, tearDownNoClose, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes = 127; + struct raft_buffer buf; + buf.base = &bytes; + buf.len = 1; + DECODE_ERROR(RAFT_MALFORMED, &buf); + return MUNIT_OK; +} + +/* The address of a server is not a nul-terminated string. 
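+ * Decoding must fail with RAFT_MALFORMED rather than read past the end of
+ * the buffer.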
*/ +TEST(configurationDecode, badAddress, setUp, tearDownNoClose, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 1, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', /* Server address */ + 1}; /* Voting flag */ + struct raft_buffer buf; + buf.base = bytes; + buf.len = sizeof bytes; + DECODE_ERROR(RAFT_MALFORMED, &buf); + return MUNIT_OK; +} + +/* The encoded configuration is invalid because it has a duplicated server + * ID. In that case RAFT_MALFORMED is returned. */ +TEST(configurationDecode, duplicatedID, setUp, tearDownNoClose, 0, NULL) +{ + struct fixture *f = data; + uint8_t bytes[] = {1, /* Version */ + 2, 0, 0, 0, 0, 0, 0, 0, /* Number of servers */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'x', '.', 'y', 0, /* Server address */ + 1, /* Role code */ + 5, 0, 0, 0, 0, 0, 0, 0, /* Server ID */ + 'z', '.', 'w', 0, /* Server address */ + 0}; /* Role code */ + struct raft_buffer buf; + buf.base = bytes; + buf.len = sizeof bytes; + DECODE_ERROR(RAFT_MALFORMED, &buf); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_err.c b/test/raft/unit/test_err.c new file mode 100644 index 000000000..95f88ea28 --- /dev/null +++ b/test/raft/unit/test_err.c @@ -0,0 +1,87 @@ +#include +#include + +#include "../../../src/raft/err.h" +#include "../lib/heap.h" +#include "../lib/runner.h" + +/* An error messages which is 249 characters. */ +#define LONG_ERRMSG \ + "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ + "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ + "boom boom boom boom boom boom boom boom boom boom boom boom boom boom " \ + "boom boom boom boom boom boom boom boom" + +/****************************************************************************** + * + * ErrMsgPrintf + * + *****************************************************************************/ + +SUITE(ErrMsgPrintf) + +/* The format string has no parameters. */ +TEST(ErrMsgPrintf, noParams, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "boom"); + munit_assert_string_equal(errmsg, "boom"); + return MUNIT_OK; +} + +/* The format string has parameters. */ +TEST(ErrMsgPrintf, params, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "boom %d", 123); + munit_assert_string_equal(errmsg, "boom 123"); + return MUNIT_OK; +} + +/****************************************************************************** + * + * ErrMsgWrapf + * + *****************************************************************************/ + +SUITE(ErrMsgWrapf) + +/* The wrapping format string has no parameters. */ +TEST(ErrMsgWrapf, noParams, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "boom"); + ErrMsgWrapf(errmsg, "no luck"); + munit_assert_string_equal(errmsg, "no luck: boom"); + return MUNIT_OK; +} + +/* The wrapping format string has parameters. */ +TEST(ErrMsgWrapf, params, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "boom"); + ErrMsgWrapf(errmsg, "no luck, %s", "joe"); + munit_assert_string_equal(errmsg, "no luck, joe: boom"); + return MUNIT_OK; +} + +/* The wrapped error message gets partially truncated. 
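+ *
+ * With RAFT_ERRMSG_BUF_SIZE being 256 bytes, the 249-character prefix plus
+ * ": " occupies 251, leaving room for just four characters of "no luck"
+ * and the terminating nul, hence the "no l" suffix asserted below.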
*/ +TEST(ErrMsgWrapf, partialTruncate, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "no luck"); + ErrMsgWrapf(errmsg, LONG_ERRMSG); + munit_assert_string_equal(errmsg, LONG_ERRMSG ": no l"); + return MUNIT_OK; +} + +/* The wrapped error message gets entirely truncated. */ +TEST(ErrMsgWrapf, fullTruncate, NULL, NULL, 0, NULL) +{ + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + ErrMsgPrintf(errmsg, "no luck"); + ErrMsgWrapf(errmsg, LONG_ERRMSG " boom"); + munit_assert_string_equal(errmsg, LONG_ERRMSG " boom"); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_flags.c b/test/raft/unit/test_flags.c new file mode 100644 index 000000000..7fbbe26db --- /dev/null +++ b/test/raft/unit/test_flags.c @@ -0,0 +1,97 @@ +#include "../../../src/raft/flags.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * flags + * + *****************************************************************************/ + +SUITE(flags) + +TEST(flags, empty, NULL, NULL, 0, NULL) +{ + raft_flags flags = 0; + for (int i = 0; i < 64; i++) { + munit_assert_false(flagsIsSet(flags, ((raft_flags)1) << i)); + } + return MUNIT_OK; +} + +TEST(flags, setClear, NULL, NULL, 0, NULL) +{ + raft_flags flags = 0; + raft_flags flag = 0; + for (int i = 0; i < 64; i++) { + flag = ((raft_flags)1) << i; + flags = flagsSet(flags, flag); + munit_assert_true(flagsIsSet(flags, flag)); + flags = flagsClear(flags, flag); + munit_assert_false(flagsIsSet(flags, flag)); + munit_assert_true(flags == 0); + } + return MUNIT_OK; +} + +TEST(flags, setMultipleClearMultiple, NULL, NULL, 0, NULL) +{ + raft_flags in = 0; + raft_flags out; + raft_flags flags = (raft_flags)(1 | 1 << 4 | 1 << 13 | (raft_flags)1 << 40 | + (raft_flags)1 << 63); + out = flagsSet(in, flags); + /* clang-format off */ + int positions[64] = { + 1, 0, 0, 0, 1, 0, 0, 0, // 0th and 4th + 0, 0, 0, 0, 0, 1, 0, 0, // 13th + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, // 40th + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, // 63th + }; + /* clang-format on */ + for (unsigned i = 0; i < 64; i++) { + if (positions[i]) { + munit_assert_true(flagsIsSet(out, (raft_flags)1 << i)); + } else { + munit_assert_false(flagsIsSet(out, (raft_flags)1 << i)); + } + } + out = flagsClear(out, flags); + munit_assert_true(out == 0); + return MUNIT_OK; +} + +TEST(flags, setMultipleClearSingle, NULL, NULL, 0, NULL) +{ + raft_flags in = 0; + raft_flags out; + raft_flags flags = (raft_flags)(1 << 3 | 1 << 5 | 1 << 18 | + (raft_flags)1 << 32 | (raft_flags)1 << 35); + out = flagsSet(in, flags); + /* clang-format off */ + int positions[64] = { + 0, 0, 0, 1, 0, 1, 0, 0, // 3rd and 5th + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, // 18th + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 1, 0, 0, 0, 0, // 32rd 35th + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + /* clang-format on */ + for (unsigned i = 0; i < 64; i++) { + if (positions[i]) { + munit_assert_true(flagsIsSet(out, (raft_flags)1 << i)); + } else { + munit_assert_false(flagsIsSet(out, (raft_flags)1 << i)); + } + } + out = flagsClear(out, (raft_flags)1 << 32); + munit_assert_true( + out == (raft_flags)(1 << 3 | 1 << 5 | 1 << 18 | (raft_flags)1 << 35)); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_log.c b/test/raft/unit/test_log.c new file mode 100644 index 000000000..0820580aa --- /dev/null +++ b/test/raft/unit/test_log.c @@ -0,0 +1,1237 @@ +#include 
"../../../src/raft/configuration.h" +#include "../../../src/raft/log.h" +#include "../lib/heap.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_HEAP; + struct raft_log *log; +}; + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Accessors */ +#define NUM_ENTRIES logNumEntries(f->log) +#define LAST_INDEX logLastIndex(f->log) +#define TERM_OF(INDEX) logTermOf(f->log, INDEX) +#define LAST_TERM logLastTerm(f->log) +#define GET(INDEX) logGet(f->log, INDEX) + +/* Append one command entry with the given term and a hard-coded payload. */ +#define APPEND(TERM) \ + { \ + struct raft_buffer buf_; \ + int rv_; \ + buf_.base = raft_malloc(8); \ + buf_.len = 8; \ + strcpy(buf_.base, "hello"); \ + rv_ = logAppend(f->log, TERM, RAFT_COMMAND, &buf_, NULL); \ + munit_assert_int(rv_, ==, 0); \ + } + +/* Same as APPEND, but repeated N times. */ +#define APPEND_MANY(TERM, N) \ + { \ + int i_; \ + for (i_ = 0; i_ < N; i_++) { \ + APPEND(TERM); \ + } \ + } + +/* Invoke append and assert that it returns the given error. */ +#define APPEND_ERROR(TERM, RV) \ + { \ + struct raft_buffer buf_; \ + int rv_; \ + buf_.base = raft_malloc(8); \ + buf_.len = 8; \ + rv_ = logAppend(f->log, TERM, RAFT_COMMAND, &buf_, NULL); \ + munit_assert_int(rv_, ==, RV); \ + raft_free(buf_.base); \ + } + +/* Append N entries all belonging to the same batch. Each entry will have 64-bit + * payload set to i * 1000, where i is the index of the entry in the batch. */ +#define APPEND_BATCH(N) \ + { \ + void *batch; \ + size_t offset; \ + int i; \ + batch = raft_malloc(8 * N); \ + munit_assert_ptr_not_null(batch); \ + offset = 0; \ + for (i = 0; i < N; i++) { \ + struct raft_buffer buf; \ + int rv; \ + buf.base = (uint8_t *)batch + offset; \ + buf.len = 8; \ + *(uint64_t *)buf.base = i * 1000; \ + rv = logAppend(f->log, 1, RAFT_COMMAND, &buf, batch); \ + munit_assert_int(rv, ==, 0); \ + offset += 8; \ + } \ + } + +#define ACQUIRE(INDEX) \ + { \ + int rv2; \ + rv2 = logAcquire(f->log, INDEX, &entries, &n); \ + munit_assert_int(rv2, ==, 0); \ + } + +#define RELEASE(INDEX) logRelease(f->log, INDEX, entries, n); + +#define TRUNCATE(N) logTruncate(f->log, N) +#define SNAPSHOT(INDEX, TRAILING) logSnapshot(f->log, INDEX, TRAILING) +#define RESTORE(INDEX, TERM) logRestore(f->log, INDEX, TERM) + +/****************************************************************************** + * + * Set up an empty configuration. + * + *****************************************************************************/ + +static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + SET_UP_HEAP; + f->log = logInit(); + if (f->log == NULL) { + munit_assert_true(false); + } + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + logClose(f->log); + TEAR_DOWN_HEAP; + free(f); +} + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert the state of the fixture's log in terms of size, front/back indexes, + * offset and number of entries. 
*/ +#define ASSERT(SIZE, FRONT, BACK, OFFSET, N) \ + munit_assert_int(f->log->size, ==, SIZE); \ + munit_assert_int(f->log->front, ==, FRONT); \ + munit_assert_int(f->log->back, ==, BACK); \ + munit_assert_int(f->log->offset, ==, OFFSET); \ + munit_assert_int(logNumEntries(f->log), ==, N) + +/* Assert the last index and term of the most recent snapshot. */ +#define ASSERT_SNAPSHOT(INDEX, TERM) \ + munit_assert_int(f->log->snapshot.last_index, ==, INDEX); \ + munit_assert_int(f->log->snapshot.last_term, ==, TERM) + +/* Assert that the term of entry at INDEX equals TERM. */ +#define ASSERT_TERM_OF(INDEX, TERM) \ + { \ + const struct raft_entry *entry; \ + entry = logGet(f->log, INDEX); \ + munit_assert_ptr_not_null(entry); \ + munit_assert_int(entry->term, ==, TERM); \ + } + +/* Assert that the number of outstanding references for the entry at INDEX + * equals COUNT. */ +#define ASSERT_REFCOUNT(INDEX, COUNT) \ + { \ + size_t i; \ + munit_assert_ptr_not_null(f->log->refs); \ + for (i = 0; i < f->log->refs_size; i++) { \ + if (f->log->refs[i].index == INDEX) { \ + munit_assert_int(f->log->refs[i].count, ==, COUNT); \ + break; \ + } \ + } \ + if (i == f->log->refs_size) { \ + munit_errorf("no refcount found for entry with index %d", \ + (int)INDEX); \ + } \ + } + +/****************************************************************************** + * + * logNumEntries + * + *****************************************************************************/ + +SUITE(logNumEntries) + +/* If the log is empty, the return value is zero. */ +TEST(logNumEntries, empty, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + munit_assert_int(NUM_ENTRIES, ==, 0); + return MUNIT_OK; +} + +/* The log is not wrapped. */ +TEST(logNumEntries, not_wrapped, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1 /* term */); + munit_assert_int(NUM_ENTRIES, ==, 1); + return MUNIT_OK; +} + +/* The log is wrapped. */ +TEST(logNumEntries, wrapped, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(4 /* last_index */, 1 /* trailing */); + APPEND_MANY(1 /* term */, 2 /* n entries */); + munit_assert_int(NUM_ENTRIES, ==, 4); + return MUNIT_OK; +} + +/* The log has an offset and is empty. */ +TEST(logNumEntries, offset, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(5 /* last index */, 0 /* trailing */); + munit_assert_int(NUM_ENTRIES, ==, 0); + return MUNIT_OK; +} + +/* The log has an offset and is not empty. */ +TEST(logNumEntries, offsetNotEmpty, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(4 /* last index */, 2 /* trailing */); + munit_assert_int(NUM_ENTRIES, ==, 3); + return MUNIT_OK; +} + +/****************************************************************************** + * + * logLastIndex + * + *****************************************************************************/ + +SUITE(logLastIndex) + +/* If the log is empty, last index is 0. */ +TEST(logLastIndex, empty, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + munit_assert_int(LAST_INDEX, ==, 0); + return MUNIT_OK; +} + +/* If the log is empty and has an offset, last index is calculated + accordingly. */ +TEST(logLastIndex, emptyWithOffset, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1); + SNAPSHOT(1, 0); + munit_assert_int(LAST_INDEX, ==, 1); + return MUNIT_OK; +} + +/* The log has one entry. 
*/ +TEST(logLastIndex, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1 /* term */); + munit_assert_int(LAST_INDEX, ==, 1); + return MUNIT_OK; +} + +/* The log has two entries. */ +TEST(logLastIndex, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 2 /* n */); + munit_assert_int(LAST_INDEX, ==, 2); + return MUNIT_OK; +} + +/* If the log starts at a certain offset, the last index is bumped + * accordingly. */ +TEST(logLastIndex, twoWithOffset, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n */); + SNAPSHOT(5 /* last index */, 2 /* trailing */); + munit_assert_int(LAST_INDEX, ==, 5); + return MUNIT_OK; +} + +/****************************************************************************** + * + * logLastTerm + * + *****************************************************************************/ + +SUITE(logLastTerm) + +/* If the log is empty, return zero. */ +TEST(logLastTerm, empty, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + munit_assert_int(LAST_TERM, ==, 0); + return MUNIT_OK; +} + +/* If the log has a snapshot and no outstanding entries, return the last term of + * the snapshot. */ +TEST(logLastTerm, snapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(1 /* term */); + SNAPSHOT(1 /* last index */, 0 /* trailing */); + munit_assert_int(LAST_TERM, ==, 1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * logTermOf + * + *****************************************************************************/ + +SUITE(logTermOf) + +/* If the given index is beyond the last index, return 0. */ +TEST(logTermOf, beyondLast, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + munit_assert_int(TERM_OF(2), ==, 0); + munit_assert_int(TERM_OF(10), ==, 0); + return MUNIT_OK; +} + +/* If the log is empty but has a snapshot, and the given index matches the last + * index of the snapshot, return the snapshot last term. */ +TEST(logTermOf, snapshotLastIndex, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(5 /* last entry */, 0 /* trailing */); + munit_assert_int(TERM_OF(5), ==, 1); + return MUNIT_OK; +} + +/* The log has one entry. */ +TEST(logTermOf, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND(3 /* term */); + munit_assert_int(TERM_OF(1), ==, 3); + return MUNIT_OK; +} + +/* The log has two entries. */ +TEST(logTermOf, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(4 /* term */, 2 /* n */); + munit_assert_int(TERM_OF(1), ==, 4); + munit_assert_int(TERM_OF(2), ==, 4); + return MUNIT_OK; +} + +/* The log has a snapshot and hence has an offset. */ +TEST(logTermOf, withSnapshot, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(3 /* last index */, 0 /* trailing */); + munit_assert_int(TERM_OF(1), ==, 0); + munit_assert_int(TERM_OF(2), ==, 0); + munit_assert_int(TERM_OF(3), ==, 1); + munit_assert_int(TERM_OF(4), ==, 1); + munit_assert_int(TERM_OF(5), ==, 1); + return MUNIT_OK; +} + +/* The log has a snapshot with trailing entries. 
*/
+TEST(logTermOf, snapshotTrailing, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND_MANY(1 /* term */, 5 /* n entries */);
+    SNAPSHOT(3 /* last index */, 2 /* trailing */);
+    munit_assert_int(TERM_OF(1), ==, 0);
+    munit_assert_int(TERM_OF(2), ==, 1);
+    munit_assert_int(TERM_OF(3), ==, 1);
+    munit_assert_int(TERM_OF(4), ==, 1);
+    munit_assert_int(TERM_OF(5), ==, 1);
+
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * logGet
+ *
+ *****************************************************************************/
+
+SUITE(logGet)
+
+/* The log is empty. */
+TEST(logGet, empty_log, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    munit_assert_ptr_null(GET(1));
+    return MUNIT_OK;
+}
+
+/* The log is empty but has an offset. */
+TEST(logGet, emptyWithOffset, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND_MANY(4 /* term */, 10 /* n */);
+    SNAPSHOT(10 /* last index */, 0 /* trailing */);
+    munit_assert_ptr_null(GET(1));
+    munit_assert_ptr_null(GET(10));
+    munit_assert_ptr_null(GET(11));
+    return MUNIT_OK;
+}
+
+/* The log has one entry. */
+TEST(logGet, one, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(3 /* term */);
+    munit_assert_int(GET(1)->term, ==, 3);
+    munit_assert_ptr_null(GET(2));
+    return MUNIT_OK;
+}
+
+/* The log has two entries. */
+TEST(logGet, two, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND_MANY(4 /* term */, 2 /* n */);
+    munit_assert_int(GET(1)->term, ==, 4);
+    munit_assert_int(GET(2)->term, ==, 4);
+    munit_assert_ptr_null(GET(3));
+    return MUNIT_OK;
+}
+
+/* The log starts at a certain offset. */
+TEST(logGet, twoWithOffset, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND_MANY(1 /* term */, 3 /* n */);
+    APPEND(2 /* term */);
+    APPEND(3 /* term */);
+    SNAPSHOT(4 /* last index */, 1 /* trailing */);
+    munit_assert_ptr_null(GET(1));
+    munit_assert_ptr_null(GET(2));
+    munit_assert_ptr_null(GET(3));
+    munit_assert_int(GET(4)->term, ==, 2);
+    munit_assert_int(GET(5)->term, ==, 3);
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * logAppend
+ *
+ *****************************************************************************/
+
+SUITE(logAppend)
+
+/* Append one entry to an empty log. */
+TEST(logAppend, one, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1 /* term */);
+    ASSERT(2 /* size */,
+           0 /* front */,
+           1 /* back */,
+           0 /* offset */,
+           1 /* n */);
+    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
+    ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */);
+    return MUNIT_OK;
+}
+
+/* Append two entries to an empty log. */
+TEST(logAppend, two, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    APPEND(1 /* term */);
+    APPEND(1 /* term */);
+    ASSERT(6 /* size */,
+           0 /* front */,
+           2 /* back */,
+           0 /* offset */,
+           2 /* n */);
+    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
+    ASSERT_TERM_OF(2 /* entry index */, 1 /* term */);
+    ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */);
+    ASSERT_REFCOUNT(2 /* entry index */, 1 /* count */);
+    return MUNIT_OK;
+}
+
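The ASSERT(SIZE, FRONT, BACK, OFFSET, N) checks above describe a circular
buffer: size is the number of allocated slots, front and back delimit the
occupied region (wrapping around size), and offset is the raft index of the
entry just before the first in-memory one. A sketch of the indexing this
implies, derived from the comments in these tests rather than from log.c
itself:

    #include <stddef.h>

    /* Slot occupied by the entry with the given raft index. */
    static size_t slot_of(size_t front, size_t size,
                          unsigned long long offset, unsigned long long index)
    {
        return (front + (size_t)(index - offset - 1)) % size;
    }

    /* In the wrap scenarios below: front=4, size=6, offset=4, so entry 5
     * sits in slot 4, entry 6 in slot 5, and entry 7 wraps to slot 0,
     * matching the "[e7, e8, NULL, NULL, e5, e6]" pictures. */

+/* Append three entries in sequence.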
*/ +TEST(logAppend, three, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* One -> [e1, NULL] */ + APPEND(1 /* term */); + + /* Two -> [e1, e2, NULL, NULL, NULL, NULL] */ + APPEND(1 /* term */); + + /* Three -> [e1, e2, e3, NULL, NULL, NULL] */ + APPEND(1 /* term */); + + ASSERT(6 /* size */, + 0 /* front */, + 3 /* back */, + 0 /* offset */, + 3 /* n */); + ASSERT_TERM_OF(1 /* entry index */, 1 /* term */); + ASSERT_TERM_OF(2 /* entry index */, 1 /* term */); + ASSERT_TERM_OF(3 /* entry index */, 1 /* term */); + ASSERT_REFCOUNT(1 /* entry index */, 1 /* count */); + ASSERT_REFCOUNT(2 /* entry index */, 1 /* count */); + ASSERT_REFCOUNT(3 /* entry index */, 1 /* count */); + + return MUNIT_OK; +} + +/* Append enough entries to force the reference count hash table to be + * resized. */ +TEST(logAppend, many, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int i; + for (i = 0; i < 3000; i++) { + APPEND(1 /* term */); + } + munit_assert_int(f->log->refs_size, ==, 4096); + return MUNIT_OK; +} + +/* Append to wrapped log that needs to be grown. */ +TEST(logAppend, wrap, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + APPEND_MANY(1 /* term */, 5 /* n */); + + /* Now the log is [e1, e2, e3, e4, e5, NULL] */ + ASSERT(6 /* size */, + 0 /* front */, + 5 /* back */, + 0 /* offset */, + 5 /* n */); + + /* Delete the first 4 entries. */ + SNAPSHOT(4 /* last entry */, 0 /* trailing */); + + /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ + ASSERT(6 /* size */, + 4 /* front */, + 5 /* back */, + 4 /* offset */, + 1 /* n */); + + /* Append another 3 entries. */ + APPEND_MANY(1 /* term */, 3 /* n */); + + /* Now the log is [e7, e8, NULL, NULL, e5, e6] */ + ASSERT(6 /* size */, + 4 /* front */, + 2 /* back */, + 4 /* offset */, + 4 /* n */); + + /* Append another 3 entries. */ + APPEND_MANY(1 /* term */, 3 /* n */); + + /* Now the log is [e5, ..., e11, NULL, ..., NULL] */ + ASSERT(14 /* size */, + 0 /* front */, + 7 /* back */, + 4 /* offset */, + 7 /* n */); + + return MUNIT_OK; +} + +/* Append a batch of entries to an empty log. */ +TEST(logAppend, batch, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_BATCH(3); + ASSERT(6 /* size */, + 0 /* front */, + 3 /* back */, + 0 /* offset */, + 3 /* n */); + return MUNIT_OK; +} + +static char *logAppendOomHeapFaultDelay[] = {"0", "1", NULL}; +static char *logAppendOomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum logAppendOom[] = { + {TEST_HEAP_FAULT_DELAY, logAppendOomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, logAppendOomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory. */ +TEST(logAppend, oom, setUp, tearDown, 0, logAppendOom) +{ + struct fixture *f = data; + struct raft_buffer buf; + int rv; + buf.base = NULL; + buf.len = 0; + HeapFaultEnable(&f->heap); + rv = logAppend(f->log, 1, RAFT_COMMAND, &buf, NULL); + munit_assert_int(rv, ==, RAFT_NOMEM); + return MUNIT_OK; +} + +/* Out of memory when trying to grow the refs count table. 
*/ +TEST(logAppend, oomRefs, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1, LOG__REFS_INITIAL_SIZE); + HeapFaultConfig(&f->heap, 1, 1); + HeapFaultEnable(&f->heap); + APPEND_ERROR(1, RAFT_NOMEM); + return MUNIT_OK; +} + +/****************************************************************************** + * + * logAppendConfiguration + * + *****************************************************************************/ + +SUITE(logAppendConfiguration) + +static char *logAppendConfigurationOomHeapFaultDelay[] = {"0", "1", NULL}; +static char *logAppendConfigurationOomHeapFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum logAppendConfigurationOom[] = { + {TEST_HEAP_FAULT_DELAY, logAppendConfigurationOomHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, logAppendConfigurationOomHeapFaultRepeat}, + {NULL, NULL}, +}; + +/* Out of memory. */ +TEST(logAppendConfiguration, oom, setUp, tearDown, 0, logAppendConfigurationOom) +{ + struct fixture *f = data; + struct raft_configuration configuration; + int rv; + + configurationInit(&configuration); + rv = configurationAdd(&configuration, 1, "1", RAFT_VOTER); + munit_assert_int(rv, ==, 0); + + HeapFaultEnable(&f->heap); + + rv = logAppendConfiguration(f->log, 1, &configuration); + munit_assert_int(rv, ==, RAFT_NOMEM); + + configurationClose(&configuration); + + return MUNIT_OK; +} + +/****************************************************************************** + * + * logAcquire + * + *****************************************************************************/ + +SUITE(logAcquire) + +/* Acquire a single log entry. */ +TEST(logAcquire, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + APPEND(1 /* term */); + ACQUIRE(1 /* index */); + munit_assert_ptr_not_null(entries); + munit_assert_int(n, ==, 1); + munit_assert_int(entries[0].type, ==, RAFT_COMMAND); + ASSERT_REFCOUNT(1 /* index */, 2 /* count */); + RELEASE(1 /* index */); + ASSERT_REFCOUNT(1 /* index */, 1 /* count */); + return MUNIT_OK; +} + +/* Acquire two log entries. */ +TEST(logAcquire, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + APPEND(1 /* term */); + APPEND(1 /* term */); + ACQUIRE(1 /* index */); + munit_assert_ptr_not_null(entries); + munit_assert_int(n, ==, 2); + munit_assert_int(entries[0].type, ==, RAFT_COMMAND); + munit_assert_int(entries[1].type, ==, RAFT_COMMAND); + ASSERT_REFCOUNT(1 /* index */, 2 /* count */); + ASSERT_REFCOUNT(2 /* index */, 2 /* count */); + RELEASE(1 /* index */); + ASSERT_REFCOUNT(1 /* index */, 1 /* count */); + ASSERT_REFCOUNT(2 /* index */, 1 /* count */); + return MUNIT_OK; +} + +/* Acquire two log entries in a wrapped log. */ +TEST(logAcquire, wrap, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + + APPEND_MANY(1 /* term */, 5 /* n */); + + /* Now the log is [e1, e2, e3, e4, e5, NULL] */ + ASSERT(6 /* size */, + 0 /* front */, + 5 /* back */, + 0 /* offset */, + 5 /* n */); + + /* Delete the first 4 entries. */ + SNAPSHOT(4 /* last index */, 0 /* trailing */); + + /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ + ASSERT(6 /* size */, + 4 /* front */, + 5 /* back */, + 4 /* offset */, + 1 /* n */); + + /* Append another 3 entries. 
*/
+    APPEND_MANY(1 /* term */, 3 /* n */);
+
+    /* Now the log is [e7, e8, NULL, NULL, e5, e6] */
+    ASSERT(6 /* size */,
+           4 /* front */,
+           2 /* back */,
+           4 /* offset */,
+           4 /* n */);
+
+    ACQUIRE(6 /* index */);
+    munit_assert_int(n, ==, 3);
+    RELEASE(6 /* index */);
+
+    return MUNIT_OK;
+}
+
+/* Acquire several entries, some of which belong to batches. */
+TEST(logAcquire, batch, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry *entries;
+    unsigned n;
+
+    APPEND(1 /* term */);
+    APPEND_BATCH(2 /* n entries */);
+    APPEND(1 /* term */);
+    APPEND_BATCH(3 /* n entries */);
+
+    ACQUIRE(2 /* index */);
+    munit_assert_ptr_not_null(entries);
+    munit_assert_int(n, ==, 6);
+    ASSERT_REFCOUNT(2 /* index */, 2 /* count */);
+
+    /* Truncate the last 5 entries, so the only references left for the second
+     * batch are the ones in the acquired entries. */
+    TRUNCATE(3 /* index */);
+
+    RELEASE(2 /* index */);
+
+    ASSERT_REFCOUNT(2 /* index */, 1 /* count */);
+
+    return MUNIT_OK;
+}
+
+/* Trying to acquire entries out of range results in a NULL pointer. */
+TEST(logAcquire, outOfRange, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry *entries;
+    unsigned n;
+
+    APPEND(1 /* term */);
+    APPEND(1 /* term */);
+    SNAPSHOT(1 /* index */, 0 /* trailing */);
+
+    ACQUIRE(1 /* index */);
+    munit_assert_ptr_null(entries);
+    ACQUIRE(3 /* index */);
+    munit_assert_ptr_null(entries);
+
+    return MUNIT_OK;
+}
+
+/* Out of memory. */
+TEST(logAcquire, oom, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    struct raft_entry *entries;
+    unsigned n;
+    int rv;
+
+    APPEND(1 /* term */);
+
+    HeapFaultConfig(&f->heap, 0, 1);
+    HeapFaultEnable(&f->heap);
+
+    rv = logAcquire(f->log, 1, &entries, &n);
+    munit_assert_int(rv, ==, RAFT_NOMEM);
+
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * logTruncate
+ *
+ *****************************************************************************/
+
+SUITE(logTruncate)
+
+/* Truncate the last entry of a log with a single entry. */
+TEST(logTruncate, lastOfOne, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+
+    APPEND(1 /* term */);
+    TRUNCATE(1 /* index */);
+
+    ASSERT(0 /* size */,
+           0 /* front */,
+           0 /* back */,
+           0 /* offset */,
+           0 /* n */);
+
+    return MUNIT_OK;
+}
+
+/* Truncate the last entry of a log with two entries. */
+TEST(logTruncate, lastOfTwo, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+
+    APPEND(1 /* term */);
+    APPEND(1 /* term */);
+
+    TRUNCATE(2 /* index */);
+
+    ASSERT(6 /* size */,
+           0 /* front */,
+           1 /* back */,
+           0 /* offset */,
+           1 /* n */);
+    ASSERT_TERM_OF(1 /* entry index */, 1 /* term */);
+
+    return MUNIT_OK;
+}
+
+/* Truncate from an entry which makes the log wrap. */
+TEST(logTruncate, wrap, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+
+    APPEND_MANY(1 /* term */, 5 /* n entries */);
+
+    /* Now the log is [e1, e2, e3, e4, e5, NULL] */
+    ASSERT(6 /* size */,
+           0 /* front */,
+           5 /* back */,
+           0 /* offset */,
+           5 /* n */);
+
+    /* Delete the first 4 entries. */
+    SNAPSHOT(4 /* last index */, 0 /* trailing */);
+
+    /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */
+    ASSERT(6 /* size */,
+           4 /* front */,
+           5 /* back */,
+           4 /* offset */,
+           1 /* n */);
+
+    /* Append another 3 entries.
*/ + APPEND_MANY(1 /* term */, 3 /* n entries */); + + /* Now the log is [e7, e8, NULL, NULL, e5, e6] */ + ASSERT(6 /* size */, + 4 /* front */, + 2 /* back */, + 4 /* offset */, + 4 /* n */); + + /* Truncate from e6 onward (wrapping) */ + TRUNCATE(6 /* index */); + + /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ + ASSERT(6 /* size */, + 4 /* front */, + 5 /* back */, + 4 /* offset */, + 1 /* n */); + + return MUNIT_OK; +} + +/* Truncate the last entry of a log with a single entry, which still has an + * outstanding reference created by a call to logAcquire(). */ +TEST(logTruncate, referenced, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + + APPEND(1 /* term */); + ACQUIRE(1 /* index */); + TRUNCATE(1 /* index */); + + ASSERT(0 /* size */, + 0 /* front */, + 0 /* back */, + 0 /* offset */, + 0 /* n */); + + /* The entry has still an outstanding reference. */ + ASSERT_REFCOUNT(1 /* index */, 1 /* count */); + + munit_assert_string_equal((const char *)entries[0].buf.base, "hello"); + + RELEASE(1 /* index */); + ASSERT_REFCOUNT(1 /* index */, 0 /* count */); + + return MUNIT_OK; +} + +/* Truncate all entries belonging to a batch. */ +TEST(logTruncate, batch, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_BATCH(3 /* n entries */); + TRUNCATE(1 /* index */); + munit_assert_int(f->log->size, ==, 0); + return MUNIT_OK; +} + +/* Acquire entries at a certain index. Truncate the log at that index. The + * truncated entries are still referenced. Then append a new entry, which will + * have the same index but different term. */ +TEST(logTruncate, acquired, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + + APPEND(1 /* term */); + APPEND(1 /* term */); + ACQUIRE(2 /* index */); + munit_assert_int(n, ==, 1); + + TRUNCATE(2 /* index */); + + APPEND(2 /* term */); + + RELEASE(2 /*index */); + + return MUNIT_OK; +} + +/* Acquire some entries, truncate the log and then append new ones forcing the + log to be grown and the reference count hash table to be re-built. */ +TEST(logTruncate, acquireAppend, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + size_t i; + + APPEND(1 /* term */); + APPEND(1 /* term */); + + ACQUIRE(2); + + munit_assert_int(n, ==, 1); + + TRUNCATE(2); + + for (i = 0; i < LOG__REFS_INITIAL_SIZE; i++) { + APPEND(2 /* term */); + } + + RELEASE(2); + + return MUNIT_OK; +} + +static char *logTruncateAcquiredHeapFaultDelay[] = {"0", NULL}; +static char *logTruncateAcquiredFaultRepeat[] = {"1", NULL}; + +static MunitParameterEnum logTruncateAcquiredOom[] = { + {TEST_HEAP_FAULT_DELAY, logTruncateAcquiredHeapFaultDelay}, + {TEST_HEAP_FAULT_REPEAT, logTruncateAcquiredFaultRepeat}, + {NULL, NULL}, +}; + +/* Acquire entries at a certain index. Truncate the log at that index. The + * truncated entries are still referenced. Then append a new entry, which fails + * to be appended due to OOM. 
*/ +TEST(logTruncate, acquiredOom, setUp, tearDown, 0, logTruncateAcquiredOom) +{ + struct fixture *f = data; + struct raft_entry *entries; + unsigned n; + struct raft_buffer buf; + int rv; + + APPEND(1 /* term */); + APPEND(1 /* term */); + + ACQUIRE(2); + munit_assert_int(n, ==, 1); + + TRUNCATE(2); + + buf.base = NULL; + buf.len = 0; + + HeapFaultEnable(&f->heap); + + rv = logAppend(f->log, 2, RAFT_COMMAND, &buf, NULL); + munit_assert_int(rv, ==, RAFT_NOMEM); + + RELEASE(2); + + return MUNIT_OK; +} + +/****************************************************************************** + * + * logSnapshot + * + *****************************************************************************/ + +SUITE(logSnapshot) + +/* Take a snapshot at entry 3, keeping 2 trailing entries. */ +TEST(logSnapshot, trailing, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + APPEND(1 /* term */); + APPEND(2 /* term */); + APPEND(2 /* term */); + + SNAPSHOT(3 /* last index */, 2 /* trailing */); + + ASSERT(6 /* size */, + 1 /* front */, + 3 /* back */, + 1 /* offset */, + 2 /* n */); + + ASSERT_SNAPSHOT(3 /* index */, 2 /* term */); + + munit_assert_int(NUM_ENTRIES, ==, 2); + munit_assert_int(LAST_INDEX, ==, 3); + + return MUNIT_OK; +} + +/* Take a snapshot when the number of outstanding entries is lower than the + * desired trail (so no entry will be deleted). */ +TEST(logSnapshot, trailingHigherThanNumEntries, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* Take a snapshot leaving just one entry in the log. */ + APPEND_MANY(1 /* term */, 3 /* n entries */); + SNAPSHOT(3 /* last index */, 1 /* trailing */); + + /* Take another snapshot, trying to leave 3 entries, but only 2 are + * available at all. */ + APPEND(2 /* term */); + + SNAPSHOT(4 /* last index */, 3 /* trailing */); + + ASSERT(6 /* size */, + 2 /* front */, + 4 /* back */, + 2 /* offset */, + 2 /* n */); + + ASSERT_SNAPSHOT(4 /* index */, 2 /* term */); + + munit_assert_int(NUM_ENTRIES, ==, 2); + munit_assert_int(LAST_INDEX, ==, 4); + + return MUNIT_OK; +} + +/* Take a snapshot when the number of outstanding entries is exactly equal to + * the desired trail (so no entry will be deleted). */ +TEST(logSnapshot, trailingMatchesOutstanding, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* Take a snapshot leaving just one entry in the log. */ + APPEND_MANY(1 /* term */, 3 /* n entries */); + SNAPSHOT(3 /* last index */, 1 /* trailing */); + + /* Take another snapshot, leaving 2 entries, which are the ones we have. */ + APPEND(2 /* term */); + + SNAPSHOT(4 /* last index */, 2 /* trailing */); + + ASSERT(6 /* size */, + 2 /* front */, + 4 /* back */, + 2 /* offset */, + 2 /* n */); + + ASSERT_SNAPSHOT(4 /* index */, 2 /* term */); + + munit_assert_int(NUM_ENTRIES, ==, 2); + munit_assert_int(LAST_INDEX, ==, 4); + + return MUNIT_OK; +} + +/* Take a snapshot at an index which is not the last one. */ +TEST(logSnapshot, lessThanHighestIndex, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + /* Take a snapshot leaving three entries in the log. */ + APPEND_MANY(1 /* term */, 5 /* n entries */); + SNAPSHOT(4 /* last index */, 2 /* trailing */); + + ASSERT(6 /* size */, + 2 /* front */, + 5 /* back */, + 2 /* offset */, + 3 /* n */); + + ASSERT_SNAPSHOT(4 /* index */, 1 /* term */); + + munit_assert_int(NUM_ENTRIES, ==, 3); + munit_assert_int(LAST_INDEX, ==, 5); + + return MUNIT_OK; +} + +/* Take a snapshot at a point where the log needs to wrap. 
*/ +TEST(logSnapshot, wrap, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + + APPEND_MANY(1 /* term */, 5 /* n entries */); + + /* Now the log is [e1, e2, e3, e4, e5, NULL] */ + ASSERT(6 /* size */, + 0 /* front */, + 5 /* back */, + 0 /* offset */, + 5 /* n */); + + /* Take a snapshot at e5, keeping just e5 itself. */ + SNAPSHOT(5 /* last index */, 1 /* trailing */); + + /* Now the log is [NULL, NULL, NULL, NULL, e5, NULL] */ + ASSERT(6 /* size */, + 4 /* front */, + 5 /* back */, + 4 /* offset */, + 1 /* n */); + + ASSERT_SNAPSHOT(5 /* index */, 1 /* term */); + + /* Append another 4 entries. */ + APPEND_MANY(1 /* term */, 4 /* n */); + + /* Now the log is [e7, e8, e9, NULL, e5, e6] */ + ASSERT(6 /* size */, + 4 /* front */, + 3 /* back */, + 4 /* offset */, + 5 /* n */); + + /* Take a snapshot at e8 keeping only e8 itself (wrapping) */ + SNAPSHOT(8 /* last index */, 1 /* trailing */); + + /* Now the log is [NULL, e8, e9, NULL, NULL, NULL] */ + ASSERT(6 /* size */, + 1 /* front */, + 3 /* back */, + 7 /* offset */, + 2 /* n */); + + ASSERT_SNAPSHOT(8 /* index */, 1 /* term */); + + return MUNIT_OK; +} + +/****************************************************************************** + * + * logRestore + * + *****************************************************************************/ + +SUITE(logRestore) + +/* Mimic the initial restore of a snapshot after loading state from disk, when + * there are no outstanding entries. */ +TEST(logRestore, initial, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + RESTORE(2 /* last index */, 3 /* last term */); + ASSERT_SNAPSHOT(2 /* index */, 3 /* term */); + munit_assert_int(LAST_INDEX, ==, 2); + return MUNIT_OK; +} + +/* If there are existing entries they are wiped out. */ +TEST(logRestore, wipe, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + APPEND_MANY(1 /* term */, 5 /* n entries */); + RESTORE(2 /* last index */, 3 /* last term */); + ASSERT_SNAPSHOT(2 /* index */, 3 /* term */); + munit_assert_int(LAST_INDEX, ==, 2); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_queue.c b/test/raft/unit/test_queue.c new file mode 100644 index 000000000..aee0f0a4d --- /dev/null +++ b/test/raft/unit/test_queue.c @@ -0,0 +1,260 @@ +#include "../../../src/raft/queue.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture with a single queue and a few test items that can be added to it. + * + *****************************************************************************/ + +struct item +{ + int value; + queue queue; +}; + +struct fixture +{ + queue queue; + struct item items[3]; +}; + +static void *setUp(MUNIT_UNUSED const MunitParameter params[], + MUNIT_UNUSED void *user_data) +{ + struct fixture *f = munit_malloc(sizeof *f); + QUEUE_INIT(&f->queue); + return f; +} + +static void tearDown(void *data) +{ + struct fixture *f = data; + free(f); +} + +/****************************************************************************** + * + * Helper macros + * + *****************************************************************************/ + +/* Initialize and push the given number of fixture items to the fixture's + * queue. Each item will have a value equal to its index plus one. */ +#define PUSH(N) \ + { \ + int i_; \ + for (i_ = 0; i_ < N; i_++) { \ + struct item *item_ = &f->items[i_]; \ + item_->value = i_ + 1; \ + QUEUE_PUSH(&f->queue, &item_->queue); \ + } \ + } + +/* Remove the i'th fixture item from the fixture queue. 
*/ +#define REMOVE(I) QUEUE_REMOVE(&f->items[I].queue) + +/****************************************************************************** + * + * Assertions + * + *****************************************************************************/ + +/* Assert that the item at the head of the fixture's queue has the given + * value. */ +#define ASSERT_HEAD(VALUE) \ + { \ + queue *head_ = QUEUE_HEAD(&f->queue); \ + struct item *item_; \ + item_ = QUEUE_DATA(head_, struct item, queue); \ + munit_assert_int(item_->value, ==, VALUE); \ + } + +/* Assert that the item at the tail of the queue has the given value. */ +#define ASSERT_TAIL(VALUE) \ + { \ + queue *tail_ = QUEUE_TAIL(&f->queue); \ + struct item *item_; \ + item_ = QUEUE_DATA(tail_, struct item, queue); \ + munit_assert_int(item_->value, ==, VALUE); \ + } + +/* Assert that the fixture's queue is empty. */ +#define ASSERT_EMPTY munit_assert_true(QUEUE_IS_EMPTY(&f->queue)) + +/* Assert that the fixture's queue is not empty. */ +#define ASSERT_NOT_EMPTY munit_assert_false(QUEUE_IS_EMPTY(&f->queue)) + +/****************************************************************************** + * + * QUEUE_IS_EMPTY + * + *****************************************************************************/ + +SUITE(QUEUE_IS_EMPTY) + +TEST(QUEUE_IS_EMPTY, yes, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + ASSERT_EMPTY; + return MUNIT_OK; +} + +TEST(QUEUE_IS_EMPTY, no, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(1); + ASSERT_NOT_EMPTY; + return MUNIT_OK; +} + +/****************************************************************************** + * + * QUEUE_PUSH + * + *****************************************************************************/ + +SUITE(QUEUE_PUSH) + +TEST(QUEUE_PUSH, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(1); + ASSERT_HEAD(1); + return MUNIT_OK; +} + +TEST(QUEUE_PUSH, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + int i; + PUSH(2); + for (i = 0; i < 2; i++) { + ASSERT_HEAD(i + 1); + REMOVE(i); + } + ASSERT_EMPTY; + return MUNIT_OK; +} + +/****************************************************************************** + * + * QUEUE_REMOVE + * + *****************************************************************************/ + +SUITE(QUEUE_REMOVE) + +TEST(QUEUE_REMOVE, first, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(3); + REMOVE(0); + ASSERT_HEAD(2); + return MUNIT_OK; +} + +TEST(QUEUE_REMOVE, second, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(3); + REMOVE(1); + ASSERT_HEAD(1); + return MUNIT_OK; +} + +TEST(QUEUE_REMOVE, success, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(3); + REMOVE(2); + ASSERT_HEAD(1); + return MUNIT_OK; +} + +/****************************************************************************** + * + * QUEUE_TAIL + * + *****************************************************************************/ + +SUITE(QUEUE_TAIL) + +TEST(QUEUE_TAIL, one, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(1); + ASSERT_TAIL(1); + return MUNIT_OK; +} + +TEST(QUEUE_TAIL, two, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(2); + ASSERT_TAIL(2); + return MUNIT_OK; +} + +TEST(QUEUE_TAIL, three, setUp, tearDown, 0, NULL) +{ + struct fixture *f = data; + PUSH(3); + ASSERT_TAIL(3); + return MUNIT_OK; +} + +/****************************************************************************** + * + * QUEUE_FOREACH + * + *****************************************************************************/ + 
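QUEUE_DATA recovers the struct that embeds a queue node, in the classic
intrusive-list style, so QUEUE_FOREACH can hand back a queue pointer and the
caller maps it to its item. A sketch of the usual container-of definition
behind it (the actual macro lives in src/raft/queue.h):

    #include <stddef.h>

    /* Map a pointer to an embedded `field` back to its enclosing `type`. */
    #define QUEUE_DATA_SKETCH(e, type, field) \
        ((type *)((char *)(e)-offsetof(type, field)))
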
+SUITE(QUEUE_FOREACH)
+
+/* Loop through a queue of zero items. */
+TEST(QUEUE_FOREACH, zero, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    queue *head;
+    int count = 0;
+    QUEUE_FOREACH (head, &f->queue) {
+        count++;
+    }
+    munit_assert_int(count, ==, 0);
+    return MUNIT_OK;
+}
+
+/* Loop through a queue of one item. */
+TEST(QUEUE_FOREACH, one, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    queue *head;
+    int count = 0;
+    PUSH(1);
+    QUEUE_FOREACH (head, &f->queue) {
+        count++;
+    }
+    munit_assert_int(count, ==, 1);
+    return MUNIT_OK;
+}
+
+/* Loop through a queue of two items. The order of the loop is from the head to
+ * the tail. */
+TEST(QUEUE_FOREACH, two, setUp, tearDown, 0, NULL)
+{
+    struct fixture *f = data;
+    queue *head;
+    int values[2] = {0, 0};
+    int i = 0;
+    PUSH(2);
+    QUEUE_FOREACH (head, &f->queue) {
+        struct item *item;
+        item = QUEUE_DATA(head, struct item, queue);
+        values[i] = item->value;
+        i++;
+    }
+    munit_assert_int(values[0], ==, 1);
+    munit_assert_int(values[1], ==, 2);
+    return MUNIT_OK;
+}
diff --git a/test/raft/unit/test_uv_fs.c b/test/raft/unit/test_uv_fs.c
new file mode 100644
index 000000000..a72206dc9
--- /dev/null
+++ b/test/raft/unit/test_uv_fs.c
@@ -0,0 +1,473 @@
+#include <unistd.h>
+
+#include "../../../src/raft/uv_fs.h"
+#include "../../../src/raft/uv_os.h"
+#include "../lib/aio.h"
+#include "../lib/dir.h"
+#include "../lib/runner.h"
+
+/******************************************************************************
+ *
+ * UvFsCheckDir
+ *
+ *****************************************************************************/
+
+/* Invoke UvFsCheckDir passing it the given dir. */
+#define CHECK_DIR(DIR)                          \
+    {                                           \
+        int _rv;                                \
+        char _errmsg[RAFT_ERRMSG_BUF_SIZE];     \
+        _rv = UvFsCheckDir(DIR, _errmsg);       \
+        munit_assert_int(_rv, ==, 0);           \
+    }
+
+/* Invoke UvFsCheckDir passing it the given dir and check that the given error
+ * occurs. */
+#define CHECK_DIR_ERROR(DIR, RV, ERRMSG)            \
+    {                                               \
+        int _rv;                                    \
+        char _errmsg[RAFT_ERRMSG_BUF_SIZE];         \
+        _rv = UvFsCheckDir(DIR, _errmsg);           \
+        munit_assert_int(_rv, ==, RV);              \
+        munit_assert_string_equal(_errmsg, ERRMSG); \
+    }
+
+SUITE(UvFsCheckDir)
+
+/* If the directory exists, the function succeeds. */
+TEST(UvFsCheckDir, exists, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    CHECK_DIR(dir);
+    return MUNIT_OK;
+}
+
+/* If the directory doesn't exist, an error is returned. */
+TEST(UvFsCheckDir, doesNotExist, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *parent = data;
+    char errmsg[RAFT_ERRMSG_BUF_SIZE];
+    char dir[128];
+    sprintf(dir, "%s/sub", parent);
+    sprintf(errmsg, "directory '%s' does not exist", dir);
+    CHECK_DIR_ERROR(dir, RAFT_NOTFOUND, errmsg);
+    return MUNIT_OK;
+}
+
+/* If the process can't access the directory, an error is returned. */
+TEST(UvFsCheckDir, permissionDenied, NULL, NULL, 0, NULL)
+{
+    bool has_access = DirHasFile("/proc/1", "root");
+    /* Skip the test if the process actually has access to /proc/1/root. */
+    if (has_access) {
+        return MUNIT_SKIP;
+    }
+    CHECK_DIR_ERROR("/proc/1/root", RAFT_UNAUTHORIZED,
+                    "can't access directory '/proc/1/root'");
+    return MUNIT_OK;
+}
+
+/* If the given path contains a non-directory prefix, an error is returned. */
+TEST(UvFsCheckDir, notDirPrefix, NULL, NULL, 0, NULL)
+{
+    CHECK_DIR_ERROR("/dev/null/foo", RAFT_INVALID,
+                    "path '/dev/null/foo' is not a directory");
+    return MUNIT_OK;
+}
+
+/* If the given path is not a directory, an error is returned. */
+TEST(UvFsCheckDir, notDir, NULL, NULL, 0, NULL)
+{
+    CHECK_DIR_ERROR("/dev/null", RAFT_INVALID,
+                    "path '/dev/null' is not a directory");
+    return MUNIT_OK;
+}
+
+/* If the given directory is not writable, an error is returned. */
+TEST(UvFsCheckDir, notWritable, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    char errmsg[RAFT_ERRMSG_BUF_SIZE];
+    sprintf(errmsg, "directory '%s' is not writable", dir);
+    DirMakeUnwritable(dir);
+    CHECK_DIR_ERROR(dir, RAFT_INVALID, errmsg);
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvFsSyncDir
+ *
+ *****************************************************************************/
+
+/* Invoke UvFsSyncDir passing it the given dir and check that the given error
+ * occurs. */
+#define SYNC_DIR_ERROR(DIR, RV, ERRMSG)                      \
+    {                                                        \
+        char _errmsg[RAFT_ERRMSG_BUF_SIZE];                  \
+        munit_assert_int(UvFsSyncDir(DIR, _errmsg), ==, RV); \
+        munit_assert_string_equal(_errmsg, ERRMSG);          \
+    }
+
+SUITE(UvFsSyncDir)
+
+/* If the directory doesn't exist, an error is returned. */
+TEST(UvFsSyncDir, noExists, NULL, NULL, 0, NULL)
+{
+    SYNC_DIR_ERROR("/abcdef", RAFT_IOERR,
+                   "open directory: no such file or directory");
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvFsOpenFileForReading
+ *
+ *****************************************************************************/
+
+/* Open a file in the given dir and check that the given error occurs. */
+#define OPEN_FILE_FOR_READING_ERROR(DIR, FILENAME, RV, ERRMSG)          \
+    {                                                                   \
+        uv_file fd_;                                                    \
+        char errmsg_[RAFT_ERRMSG_BUF_SIZE];                             \
+        int rv_ = UvFsOpenFileForReading(DIR, FILENAME, &fd_, errmsg_); \
+        munit_assert_int(rv_, ==, RV);                                  \
+        munit_assert_string_equal(errmsg_, ERRMSG);                     \
+    }
+
+SUITE(UvFsOpenFileForReading)
+
+/* If the directory doesn't exist, an error is returned. */
+TEST(UvFsOpenFileForReading, noExists, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    OPEN_FILE_FOR_READING_ERROR(dir, "foo", RAFT_IOERR,
+                                "open: no such file or directory");
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvFsAllocateFile
+ *
+ *****************************************************************************/
+
+#define FALLOCATE_PARAM "fallocate"
+static char *fallocate_params[] = {"1", "0", NULL};
+MunitParameterEnum fallocateParams[] = {
+    {FALLOCATE_PARAM, fallocate_params},
+    {NULL, NULL},
+};
+
+/* Allocate a file with the given parameters and assert that no error occurred.
+ */
+#define ALLOCATE_FILE(DIR, FILENAME, SIZE)                                    \
+    {                                                                         \
+        uv_file fd_;                                                          \
+        char errmsg_[RAFT_ERRMSG_BUF_SIZE];                                   \
+        int rv_;                                                              \
+        bool fallocate_ = true;                                               \
+        const char *f = munit_parameters_get(params, FALLOCATE_PARAM);        \
+        if (f != NULL) {                                                      \
+            fallocate_ = atoi(f);                                             \
+        }                                                                     \
+        rv_ =                                                                 \
+            UvFsAllocateFile(DIR, FILENAME, SIZE, &fd_, fallocate_, errmsg_); \
+        munit_assert_int(rv_, ==, 0);                                         \
+        munit_assert_int(UvOsClose(fd_), ==, 0);                              \
+    }
+
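The "fallocate" test parameter toggles between reserving the file's blocks up
front and merely extending its size. A hypothetical sketch of the two paths
(names invented for illustration; the real logic lives in src/raft/uv_fs.c):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdbool.h>
    #include <unistd.h>

    static int allocate_sketch(int fd, off_t size, bool use_fallocate)
    {
        if (use_fallocate) {
            /* Reserve the blocks now, so ENOSPC surfaces immediately. */
            return posix_fallocate(fd, 0, size); /* 0 or an errno value */
        }
        /* Path for file systems without fallocate support: just extend the
         * file without reserving blocks. */
        return ftruncate(fd, size) == 0 ? 0 : errno;
    }

+/* Assert that creating a file with the given parameters fails with the given
+ * code and error message.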
*/ +#define ALLOCATE_FILE_ERROR(DIR, FILENAME, SIZE, RV, ERRMSG) \ + { \ + uv_file fd_; \ + char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \ + int rv_; \ + bool fallocate_ = true; \ + const char *f = munit_parameters_get(params, FALLOCATE_PARAM); \ + if (f != NULL) { \ + fallocate_ = atoi(f); \ + } \ + rv_ = \ + UvFsAllocateFile(DIR, FILENAME, SIZE, &fd_, fallocate_, errmsg_); \ + munit_assert_int(rv_, ==, RV); \ + munit_assert_string_equal(errmsg_, ERRMSG); \ + } + +SUITE(UvFsAllocateFile) + +/* If the given path is valid, the file gets created. */ +TEST(UvFsAllocateFile, success, DirSetUp, DirTearDown, 0, fallocateParams) +{ + const char *dir = data; + ALLOCATE_FILE(dir, /* dir */ + "foo", /* filename */ + 4096 /* size */); + munit_assert_true(DirHasFile(dir, "foo")); + return MUNIT_OK; +} + +/* The directory of given path does not exist, an error is returned. */ +TEST(UvFsAllocateFile, dirNoExists, NULL, NULL, 0, fallocateParams) +{ + ALLOCATE_FILE_ERROR("/non/existing/dir", /* dir */ + "foo", /* filename */ + 64, /* size */ + RAFT_IOERR, /* status */ + "open: no such file or directory"); + return MUNIT_OK; +} + +/* If the given path already exists, an error is returned. */ +TEST(UvFsAllocateFile, + fileAlreadyExists, + DirSetUp, + DirTearDown, + 0, + fallocateParams) +{ + const char *dir = data; + char buf[8] = {0}; + DirWriteFile(dir, "foo", buf, sizeof buf); + ALLOCATE_FILE_ERROR(dir, /* dir */ + "foo", /* filename */ + 64, /* size */ + RAFT_IOERR, /* status */ + "open: file already exists"); + return MUNIT_OK; +} + +static char *dirTmpfs_params[] = {"tmpfs", NULL}; + +MunitParameterEnum noSpaceParams[] = { + {DIR_FS_PARAM, dirTmpfs_params}, + {"fallocate", fallocate_params}, + {NULL, NULL}, +}; + +/* The file system has run out of space. */ +TEST(UvFsAllocateFile, noSpace, DirSetUp, DirTearDown, 0, noSpaceParams) +{ + const char *dir = data; + if (dir == NULL) { + return MUNIT_SKIP; + } + ALLOCATE_FILE_ERROR(dir, /* dir */ + "foo", /* filename */ + 4096 * 32768, /* size */ + RAFT_NOSPACE, /* status */ + "not enough space to allocate 134217728 bytes"); + munit_assert_false(DirHasFile(dir, "foo")); + return MUNIT_OK; +} + +/****************************************************************************** + * + * UvFsProbeCapabilities + * + *****************************************************************************/ + +/* Invoke UvFsProbeCapabilities against the given dir and assert that it returns + * the given values for direct I/O and async I/O. */ +#define PROBE_CAPABILITIES(DIR, DIRECT_IO, ASYNC_IO, FALLOCATE) \ + { \ + size_t direct_io_; \ + bool async_io_; \ + bool fallocate_; \ + char errmsg_[RAFT_ERRMSG_BUF_SIZE]; \ + int rv_; \ + rv_ = UvFsProbeCapabilities(DIR, &direct_io_, &async_io_, &fallocate_, \ + errmsg_); \ + munit_assert_int(rv_, ==, 0); \ + munit_assert_size(direct_io_, ==, DIRECT_IO); \ + munit_assert_int(fallocate_, ==, FALLOCATE); \ + if (ASYNC_IO) { \ + munit_assert_true(async_io_); \ + } else { \ + munit_assert_false(async_io_); \ + } \ + } + +/* Invoke UvFsProbeCapabilities and check that the given error occurs. 
*/
+#define PROBE_CAPABILITIES_ERROR(DIR, RV, ERRMSG)                              \
+    {                                                                          \
+        size_t direct_io_;                                                     \
+        bool async_io_;                                                        \
+        bool fallocate_;                                                       \
+        char errmsg_[RAFT_ERRMSG_BUF_SIZE];                                    \
+        int rv_;                                                               \
+        rv_ = UvFsProbeCapabilities(DIR, &direct_io_, &async_io_, &fallocate_, \
+                                    errmsg_);                                  \
+        munit_assert_int(rv_, ==, RV);                                         \
+        munit_assert_string_equal(errmsg_, ERRMSG);                            \
+    }
+
+SUITE(UvFsProbeCapabilities)
+
+TEST(UvFsProbeCapabilities, tmpfs, DirTmpfsSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    PROBE_CAPABILITIES(dir, 0, false, true);
+    return MUNIT_OK;
+}
+
+/* ZFS 0.8 reports that it supports direct I/O, but does not fully support
+ * asynchronous kernel AIO. */
+TEST(UvFsProbeCapabilities, zfsDirectIO, DirZfsSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    size_t direct_io = 0;
+#if defined(RAFT_HAVE_ZFS_WITH_DIRECT_IO)
+    direct_io = 4096;
+#endif
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    PROBE_CAPABILITIES(dir, direct_io, false, true);
+    return MUNIT_OK;
+}
+
+/* File systems that fully support direct I/O and async I/O. */
+TEST(UvFsProbeCapabilities, aio, DirSetUp, DirTearDown, 0, DirAioParams)
+{
+    const char *dir = data;
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    /* FIXME: btrfs doesn't like that we perform a first write to the probe file
+     * to detect the direct I/O buffer size. */
+    if (strcmp(munit_parameters_get(params, DIR_FS_PARAM), "btrfs") == 0) {
+        return MUNIT_SKIP;
+    }
+    PROBE_CAPABILITIES(dir, 4096, true, true);
+    return MUNIT_OK;
+}
+
+/* If the given path is not executable, the block size of the underlying file
+ * system can't be determined and an error is returned. */
+TEST(UvFsProbeCapabilities, noAccess, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+
+    /* Skip the test when running as root, since EACCES would not be triggered
+     * in that case. */
+    if (getuid() == 0) {
+        return MUNIT_SKIP;
+    }
+
+    DirMakeUnexecutable(dir);
+    PROBE_CAPABILITIES_ERROR(
+        dir, RAFT_IOERR,
+        "create I/O capabilities probe file: open: permission denied");
+
+    return MUNIT_OK;
+}
+
+/* No space is left on the target device. */
+TEST(UvFsProbeCapabilities, noSpace, DirTmpfsSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    DirFill(dir, 0);
+    PROBE_CAPABILITIES_ERROR(dir, RAFT_NOSPACE,
+                             "create I/O capabilities probe file: not enough "
+                             "space to allocate 4096 bytes");
+    return MUNIT_OK;
+}
+
+/* The uvIoSetup() call fails with EAGAIN. */
+TEST(UvFsProbeCapabilities, noResources, DirBtrfsSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    aio_context_t ctx = 0;
+    int rv;
+    if (dir == NULL) {
+        return MUNIT_SKIP;
+    }
+    rv = AioFill(&ctx, 0);
+    if (rv != 0) {
+        return MUNIT_SKIP;
+    }
+    PROBE_CAPABILITIES_ERROR(
+        dir, RAFT_IOERR,
+        "probe Async I/O: io_setup: resource temporarily unavailable");
+    AioDestroy(ctx);
+    return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvFsMakeFile
+ *
+ *****************************************************************************/
+
+SUITE(UvFsMakeFile)
+
+/* If the file does not exist, the function succeeds. */
+TEST(UvFsMakeFile, notExists, DirSetUp, DirTearDown, 0, NULL)
+{
+    const char *dir = data;
+    int rv;
+    char errmsg[RAFT_ERRMSG_BUF_SIZE];
+    struct raft_buffer bufs[2] = {{0}, {0}};
+    rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg);
+    munit_assert_int(rv, ==, 0);
+    return MUNIT_OK;
+}
+
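The exists test that follows pins down the creation semantics: UvFsMakeFile
must fail when the target is already present, as an open with O_CREAT | O_EXCL
would. A behavioral sketch (whether the implementation uses exactly these
flags is an assumption):

    #include <fcntl.h>

    static int make_file_sketch(const char *path)
    {
        int fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0600);
        if (fd == -1) {
            return -1; /* errno is EEXIST if the file already exists */
        }
        /* ... write the buffers, fsync and close ... */
        return 0;
    }

+/* If the file exists, the function does not succeed.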
*/ +TEST(UvFsMakeFile, exists, DirSetUp, DirTearDown, 0, NULL) +{ + const char *dir = data; + int rv; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + struct raft_buffer bufs[2] = {{0}, {0}}; + rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); + munit_assert_int(rv, ==, 0); + rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); + munit_assert_int(rv, !=, 0); + return MUNIT_OK; +} + +/****************************************************************************** + * + * UvFsRenameFile + * + *****************************************************************************/ + +SUITE(UvFsRenameFile) + +TEST(UvFsRenameFile, rename, DirSetUp, DirTearDown, 0, NULL) +{ + const char *dir = data; + int rv; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + struct raft_buffer bufs[2] = {{0}, {0}}; + rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); + munit_assert_int(rv, ==, 0); + rv = UvFsRenameFile(dir, "foo", "bar", errmsg); + munit_assert_int(rv, ==, 0); + munit_assert_false(DirHasFile(dir, "foo")); + munit_assert_true(DirHasFile(dir, "bar")); + return MUNIT_OK; +} + +/* rename to same name */ +TEST(UvFsRenameFile, same, DirSetUp, DirTearDown, 0, NULL) +{ + const char *dir = data; + int rv; + char errmsg[RAFT_ERRMSG_BUF_SIZE]; + struct raft_buffer bufs[2] = {{0}, {0}}; + rv = UvFsMakeFile(dir, "foo", bufs, 2, errmsg); + munit_assert_int(rv, ==, 0); + rv = UvFsRenameFile(dir, "foo", "foo", errmsg); + munit_assert_int(rv, ==, 0); + munit_assert_true(DirHasFile(dir, "foo")); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_uv_os.c b/test/raft/unit/test_uv_os.c new file mode 100644 index 000000000..ebbe5f7f1 --- /dev/null +++ b/test/raft/unit/test_uv_os.c @@ -0,0 +1,85 @@ +#include "../../../src/raft/uv_os.h" +#include "../lib/runner.h" + +SUITE(UvOsJoin) + +/* dir and filename have sensible lengths */ +TEST(UvOsJoin, basic, NULL, NULL, 0, NULL) +{ + int rv; + const char *dir = "/home"; + const char *filename = "testfile"; + char path[UV__PATH_SZ]; + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, ==, 0); + munit_assert_string_equal(path, "/home/testfile"); + return MUNIT_OK; +} + +TEST(UvOsJoin, dirTooLong, NULL, NULL, 0, NULL) +{ + int rv; + char path[UV__PATH_SZ]; + char dir[UV__DIR_LEN + 2]; /* Room for '\0' and then 1 char over limit. */ + memset((char *)dir, '/', sizeof(dir)); + dir[sizeof(dir) - 1] = '\0'; + const char *filename = "testfile"; + + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, !=, 0); + return MUNIT_OK; +} + +TEST(UvOsJoin, filenameTooLong, NULL, NULL, 0, NULL) +{ + int rv; + char path[UV__PATH_SZ]; + const char *dir = "testdir"; + char filename[UV__FILENAME_LEN + 2]; + memset((char *)filename, 'a', sizeof(filename)); + filename[sizeof(filename) - 1] = '\0'; + + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, !=, 0); + return MUNIT_OK; +} + +TEST(UvOsJoin, dirAndFilenameTooLong, NULL, NULL, 0, NULL) +{ + int rv; + /* +2 to silence compilers that complain that dir & filename would overflow + * path, but it's strictly not needed and doesn't influence the test. 
*/ + char path[UV__PATH_SZ + 2]; + char dir[UV__DIR_LEN + 2]; + memset((char *)dir, '/', sizeof(dir)); + dir[sizeof(dir) - 1] = '\0'; + + char filename[UV__FILENAME_LEN + 2]; + memset((char *)filename, 'a', sizeof(filename)); + filename[sizeof(filename) - 1] = '\0'; + + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, !=, 0); + return MUNIT_OK; +} + +TEST(UvOsJoin, dirAndFilenameMax, NULL, NULL, 0, NULL) +{ + int rv; + char path[UV__PATH_SZ]; + char dir[UV__DIR_LEN + 1]; + memset((char *)dir, '/', sizeof(dir)); + dir[sizeof(dir) - 1] = '\0'; + + char filename[UV__FILENAME_LEN + 1]; + memset((char *)filename, 'a', sizeof(filename)); + filename[sizeof(filename) - 1] = '\0'; + + rv = UvOsJoin(dir, filename, path); + munit_assert_int(rv, ==, 0); + char cmp_path[UV__DIR_LEN + UV__FILENAME_LEN + 1 + 1]; + snprintf(cmp_path, UV__DIR_LEN + UV__FILENAME_LEN + 1 + 1, "%s/%s", dir, + filename); + munit_assert_string_equal(path, cmp_path); + return MUNIT_OK; +} diff --git a/test/raft/unit/test_uv_writer.c b/test/raft/unit/test_uv_writer.c new file mode 100644 index 000000000..27ac4d665 --- /dev/null +++ b/test/raft/unit/test_uv_writer.c @@ -0,0 +1,391 @@ +#include "../../../src/raft/uv_fs.h" +#include "../../../src/raft/uv_writer.h" +#include "../lib/aio.h" +#include "../lib/dir.h" +#include "../lib/loop.h" +#include "../lib/runner.h" + +/****************************************************************************** + * + * Fixture with a UvWriter and an open file ready for writing. + * + *****************************************************************************/ + +struct fixture +{ + FIXTURE_DIR; + FIXTURE_LOOP; + int fd; + size_t block_size; + size_t direct_io; + bool fallocate; + bool async_io; + char errmsg[256]; + struct UvWriter writer; + bool closed; +}; + +/****************************************************************************** + * + * Helper macros. + * + *****************************************************************************/ + +struct result +{ + int status; + bool done; +}; + +static void closeCb(struct UvWriter *writer) +{ + struct fixture *f = writer->data; + f->closed = true; +} + +static void submitCbAssertResult(struct UvWriterReq *req, int status) +{ + struct result *result = req->data; + munit_assert_int(status, ==, result->status); + result->done = true; +} + +/* Initialize the fixture's writer. */ +#define INIT(MAX_WRITES) \ + do { \ + int _rv; \ + _rv = UvWriterInit(&f->writer, &f->loop, f->fd, f->direct_io != 0, \ + f->async_io, MAX_WRITES, f->errmsg); \ + munit_assert_int(_rv, ==, 0); \ + f->writer.data = f; \ + f->closed = false; \ + } while (0) + +/* Try to initialize the fixture's writer and check that the given error is + * returned. */ +#define INIT_ERROR(RV, ERRMSG) \ + do { \ + int _rv; \ + _rv = UvWriterInit(&f->writer, &f->loop, f->fd, f->direct_io != 0, \ + f->async_io, 1, f->errmsg); \ + munit_assert_int(_rv, ==, RV); \ + munit_assert_string_equal(f->errmsg, ERRMSG); \ + } while (0) + +/* Close helper. 
*/ +#define CLOSE_SUBMIT \ + munit_assert_false(f->closed); \ + UvWriterClose(&f->writer, closeCb); \ + munit_assert_false(f->closed) +#define CLOSE_WAIT LOOP_RUN_UNTIL(&f->closed) +#define CLOSE \ + CLOSE_SUBMIT; \ + CLOSE_WAIT + +#define MAKE_BUFS(BUFS, N_BUFS, CONTENT) \ + { \ + int __i; \ + BUFS = munit_malloc(sizeof *BUFS * N_BUFS); \ + for (__i = 0; __i < N_BUFS; __i++) { \ + uv_buf_t *__buf = &BUFS[__i]; \ + __buf->len = f->block_size; \ + __buf->base = aligned_alloc(f->block_size, f->block_size); \ + munit_assert_ptr_not_null(__buf->base); \ + memset(__buf->base, CONTENT + __i, __buf->len); \ + } \ + } + +#define DESTROY_BUFS(BUFS, N_BUFS) \ + { \ + int __i; \ + for (__i = 0; __i < N_BUFS; __i++) { \ + free(BUFS[__i].base); \ + } \ + free(BUFS); \ + } + +#define WRITE_REQ(N_BUFS, CONTENT, OFFSET, RV, STATUS) \ + struct uv_buf_t *_bufs; \ + struct UvWriterReq _req; \ + struct result _result = {STATUS, false}; \ + int _rv; \ + MAKE_BUFS(_bufs, N_BUFS, CONTENT); \ + _req.data = &_result; \ + _rv = UvWriterSubmit(&f->writer, &_req, _bufs, N_BUFS, OFFSET, \ + submitCbAssertResult); \ + munit_assert_int(_rv, ==, RV); + +/* Submit a write request with the given parameters and wait for the operation + * to successfully complete. Deallocate BUFS when done. + * + * N_BUFS is the number of buffers to allocate and write, each of them will have + * f->block_size bytes. + * + * CONTENT must be an unsigned byte value: all bytes of the first buffer will be + * filled with that value, all bytes of the second buffer will be filled will + * that value plus one, etc. + * + * OFFSET is the offset at which to write the buffers. */ +#define WRITE(N_BUFS, CONTENT, OFFSET) \ + do { \ + WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, 0 /* status */); \ + LOOP_RUN_UNTIL(&_result.done); \ + DESTROY_BUFS(_bufs, N_BUFS); \ + } while (0) + +/* Submit a write request with the given parameters and wait for the operation + * to fail with the given code and message. */ +#define WRITE_FAILURE(N_BUFS, CONTENT, OFFSET, STATUS, ERRMSG) \ + do { \ + WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, STATUS); \ + LOOP_RUN_UNTIL(&_result.done); \ + munit_assert_string_equal(f->writer.errmsg, ERRMSG); \ + DESTROY_BUFS(_bufs, N_BUFS); \ + } while (0) + +/* Submit a write request with the given parameters, close the writer right + * after and assert that the request got canceled. */ +#define WRITE_CLOSE(N_BUFS, CONTENT, OFFSET, STATUS) \ + do { \ + WRITE_REQ(N_BUFS, CONTENT, OFFSET, 0 /* rv */, STATUS); \ + CLOSE_SUBMIT; \ + munit_assert_false(_result.done); \ + LOOP_RUN_UNTIL(&_result.done); \ + DESTROY_BUFS(_bufs, N_BUFS); \ + CLOSE_WAIT; \ + } while (0) + +/* Assert that the content of the test file has the given number of blocks, each + * filled with progressive numbers. */ +#define ASSERT_CONTENT(N) \ + do { \ + size_t _size = N * f->block_size; \ + void *_buf = munit_malloc(_size); \ + unsigned _i; \ + unsigned _j; \ + \ + DirReadFile(f->dir, "foo", _buf, _size); \ + \ + for (_i = 0; _i < N; _i++) { \ + char *cursor = (char *)_buf + _i * f->block_size; \ + for (_j = 0; _j < f->block_size; _j++) { \ + munit_assert_int(cursor[_j], ==, _i + 1); \ + } \ + } \ + \ + free(_buf); \ + } while (0) + +#define N_BLOCKS 5 + +/****************************************************************************** + * + * Set up and tear down. 
+ *
+ *****************************************************************************/
+
+static void *setUpDeps(const MunitParameter params[], void *user_data)
+{
+        struct fixture *f = munit_malloc(sizeof *f);
+        char path[UV__PATH_SZ];
+        char errmsg[256];
+        int rv;
+        SET_UP_DIR;
+        SETUP_LOOP;
+        rv = UvFsProbeCapabilities(f->dir, &f->direct_io, &f->async_io,
+                                   &f->fallocate, errmsg);
+        munit_assert_int(rv, ==, 0);
+        f->block_size = f->direct_io != 0 ? f->direct_io : 4096;
+        rv = UvOsJoin(f->dir, "foo", path);
+        munit_assert_int(rv, ==, 0);
+        rv = UvOsOpen(path, O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR, &f->fd);
+        munit_assert_int(rv, ==, 0);
+        rv = UvOsFallocate(f->fd, 0, f->block_size * N_BLOCKS);
+        munit_assert_int(rv, ==, 0);
+        return f;
+}
+
+static void tearDownDeps(void *data)
+{
+        struct fixture *f = data;
+        if (f == NULL) {
+                return; /* Was skipped. */
+        }
+        UvOsClose(f->fd);
+        TEAR_DOWN_LOOP;
+        TEAR_DOWN_DIR;
+        free(f);
+}
+
+static void *setUp(const MunitParameter params[], void *user_data)
+{
+        struct fixture *f = setUpDeps(params, user_data);
+        if (f == NULL) {
+                return NULL;
+        }
+        INIT(1);
+        return f;
+}
+
+static void tearDown(void *data)
+{
+        struct fixture *f = data;
+        if (f == NULL) {
+                return; /* Was skipped. */
+        }
+        CLOSE;
+        tearDownDeps(f);
+}
+
+/******************************************************************************
+ *
+ * UvWriterInit
+ *
+ *****************************************************************************/
+
+SUITE(UvWriterInit)
+
+/* The kernel has run out of available AIO events. */
+TEST(UvWriterInit, noResources, setUpDeps, tearDownDeps, 0, NULL)
+{
+        struct fixture *f = data;
+        aio_context_t ctx = 0;
+        int rv;
+        rv = AioFill(&ctx, 0);
+        if (rv != 0) {
+                return MUNIT_SKIP;
+        }
+        INIT_ERROR(RAFT_TOOMANY, "AIO events user limit exceeded");
+        AioDestroy(ctx);
+        return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvWriterSubmit
+ *
+ *****************************************************************************/
+
+SUITE(UvWriterSubmit)
+
+/* Write a single buffer. */
+TEST(UvWriterSubmit, one, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
+        ASSERT_CONTENT(1);
+        return MUNIT_OK;
+}
+
+/* Write two buffers, one after the other. */
+TEST(UvWriterSubmit, two, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
+        WRITE(1 /* n bufs */, 2 /* content */, f->block_size /* offset */);
+        ASSERT_CONTENT(2);
+        return MUNIT_OK;
+}
+
+/* Write the same block twice. */
+TEST(UvWriterSubmit, twice, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(1 /* n bufs */, 0 /* content */, 0 /* offset */);
+        WRITE(1 /* n bufs */, 1 /* content */, 0 /* offset */);
+        ASSERT_CONTENT(1);
+        return MUNIT_OK;
+}
+
+/* Write a vector of buffers. */
+TEST(UvWriterSubmit, vec, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
+        ASSERT_CONTENT(1);
+        return MUNIT_OK;
+}
+
+/* Write a vector of buffers twice. */
+TEST(UvWriterSubmit, vecTwice, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
+        WRITE(2 /* n bufs */, 1 /* content */, 0 /* offset */);
+        ASSERT_CONTENT(2);
+        return MUNIT_OK;
+}
+
+/* Write past the allocated space.
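+ * The file is preallocated with only N_BLOCKS blocks, so the last write of
+ * the loop below lands beyond the end of the fallocate'd region and forces
+ * the file to grow.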
*/
+TEST(UvWriterSubmit, beyondEOF, setUp, tearDown, 0, DirAllParams)
+{
+        struct fixture *f = data;
+        int i;
+        SKIP_IF_NO_FIXTURE;
+        for (i = 0; i < N_BLOCKS + 1; i++) {
+                WRITE(1 /* n bufs */, i + 1 /* content */,
+                      i * f->block_size /* offset */);
+        }
+        ASSERT_CONTENT(N_BLOCKS + 1);
+        return MUNIT_OK;
+}
+
+/* Write two different blocks concurrently. */
+TEST(UvWriterSubmit, concurrent, NULL, NULL, 0, DirAllParams)
+{
+        return MUNIT_SKIP; /* TODO: tests stop responding */
+}
+
+/* Write the same block concurrently. */
+TEST(UvWriterSubmit, concurrentSame, NULL, NULL, 0, DirAllParams)
+{
+        return MUNIT_SKIP; /* TODO: tests stop responding */
+}
+
+/* There are not enough resources to create an AIO context to perform the
+ * write. */
+TEST(UvWriterSubmit, noResources, setUpDeps, tearDown, 0, DirNoAioParams)
+{
+        struct fixture *f = data;
+        aio_context_t ctx = 0;
+        int rv;
+        SKIP_IF_NO_FIXTURE;
+        INIT(2);
+        rv = AioFill(&ctx, 0);
+        if (rv != 0) {
+                return MUNIT_SKIP;
+        }
+        WRITE_FAILURE(1, 0, 0, RAFT_TOOMANY, "AIO events user limit exceeded");
+        AioDestroy(ctx);
+        return MUNIT_OK;
+}
+
+/******************************************************************************
+ *
+ * UvWriterClose
+ *
+ *****************************************************************************/
+
+SUITE(UvWriterClose)
+
+/* Close with an in-flight write running in the threadpool. */
+TEST(UvWriterClose, threadpool, setUp, tearDownDeps, 0, DirNoAioParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE_CLOSE(1, 0, 0, 0);
+        return MUNIT_OK;
+}
+
+/* Close with an in-flight AIO write. */
+TEST(UvWriterClose, aio, setUp, tearDownDeps, 0, DirAioParams)
+{
+        struct fixture *f = data;
+        SKIP_IF_NO_FIXTURE;
+        WRITE_CLOSE(1, 0, 0, RAFT_CANCELED);
+        return MUNIT_OK;
+}
diff --git a/test/unit/ext/test_uv.c b/test/unit/ext/test_uv.c
index 7bb905d44..aad32fb47 100644
--- a/test/unit/ext/test_uv.c
+++ b/test/unit/ext/test_uv.c
@@ -1,9 +1,9 @@
-#include <raft.h>
 #include 
 #include 
 
 #include "../../../src/lib/transport.h"
+#include "../../../src/raft.h"
 
 #include "../../lib/endpoint.h"
 #include "../../lib/runner.h"
 #include "../../lib/uv.h"
diff --git a/test/unit/test_conn.c b/test/unit/test_conn.c
index 93e4a475c..769c52945 100644
--- a/test/unit/test_conn.c
+++ b/test/unit/test_conn.c
@@ -1,8 +1,3 @@
-#include <raft.h>
-#include <raft/uv.h>
-
-#include "../../include/dqlite.h"
-
 #include "../lib/client.h"
 #include "../lib/config.h"
 #include "../lib/heap.h"
@@ -17,6 +12,7 @@
 #include "../../src/conn.h"
 #include "../../src/gateway.h"
 #include "../../src/lib/transport.h"
+#include "../../src/raft.h"
 #include "../../src/transport.h"
 
 TEST_MODULE(conn);
diff --git a/test/unit/test_vfs.c b/test/unit/test_vfs.c
index 92e5f3139..cfd103d6d 100644
--- a/test/unit/test_vfs.c
+++ b/test/unit/test_vfs.c
@@ -1,7 +1,6 @@
 #include 
 #include 
-#include <raft.h>
 #include 
 
 #include "../../include/dqlite.h"
@@ -13,6 +12,7 @@
 #include "../lib/sqlite.h"
 
 #include "../../src/format.h"
+#include "../../src/raft.h"
 #include "../../src/vfs.h"
 
 static char *bools[] = {"0", "1", NULL};