diff --git a/.github/workflows/build-cachelib-centos.yml b/.github/workflows/build-cachelib-centos-long.yml
similarity index 86%
rename from .github/workflows/build-cachelib-centos.yml
rename to .github/workflows/build-cachelib-centos-long.yml
index 3b071a186a..92165f603b 100644
--- a/.github/workflows/build-cachelib-centos.yml
+++ b/.github/workflows/build-cachelib-centos-long.yml
@@ -1,7 +1,8 @@
name: build-cachelib-centos-latest
on:
schedule:
- - cron: '30 5 * * 1,4'
+ - cron: '0 7 * * *'
+
jobs:
build-cachelib-centos8-latest:
name: "CentOS/latest - Build CacheLib with all dependencies"
@@ -33,3 +34,6 @@ jobs:
uses: actions/checkout@v2
- name: "build CacheLib using build script"
run: ./contrib/build.sh -j -v -T
+ - name: "run tests"
+ timeout-minutes: 60
+ run: cd opt/cachelib/tests && ../../../run_tests.sh long
diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml
index a2ae44a569..5bc3ad3c70 100644
--- a/.github/workflows/build-cachelib-debian.yml
+++ b/.github/workflows/build-cachelib-debian.yml
@@ -1,7 +1,8 @@
name: build-cachelib-debian-10
on:
schedule:
- - cron: '30 5 * * 2,6'
+ - cron: '30 5 * * 0,3'
+
jobs:
build-cachelib-debian-10:
name: "Debian/Buster - Build CacheLib with all dependencies"
@@ -37,3 +38,6 @@ jobs:
uses: actions/checkout@v2
- name: "build CacheLib using build script"
run: ./contrib/build.sh -j -v -T
+ - name: "run tests"
+ timeout-minutes: 60
+ run: cd opt/cachelib/tests && ../../../run_tests.sh
diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml
new file mode 100644
index 0000000000..f73339e0d9
--- /dev/null
+++ b/.github/workflows/build-cachelib-docker.yml
@@ -0,0 +1,49 @@
+name: build-cachelib-docker
+on:
+ push:
+ pull_request:
+
+jobs:
+ build-cachelib-docker:
+ name: "CentOS/latest - Build CacheLib with all dependencies"
+ runs-on: ubuntu-latest
+ env:
+ REPO: cachelib
+ GITHUB_REPO: pmem/CacheLib
+ CONTAINER_REG: ghcr.io/pmem/cachelib
+ CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }}
+ CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }}
+ FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }}
+ HOST_WORKDIR: ${{ github.workspace }}
+ WORKDIR: docker
+ IMG_VER: devel
+ strategy:
+ matrix:
+ CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"]
+ steps:
+ - name: "System Information"
+ run: |
+ echo === uname ===
+ uname -a
+ echo === /etc/os-release ===
+ cat /etc/os-release
+ echo === df -hl ===
+ df -hl
+ echo === free -h ===
+ free -h
+ echo === top ===
+ top -b -n1 -1 -Eg || timeout 1 top -b -n1
+ echo === env ===
+ env
+ echo === gcc -v ===
+ gcc -v
+ - name: "checkout sources"
+ uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Pull the image or rebuild and push it
+ run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION
+
+ - name: Run the build
+ run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh
diff --git a/.github/workflows/build-cachelib.yml b/.github/workflows/build-cachelib.yml
deleted file mode 100644
index 15161c40e0..0000000000
--- a/.github/workflows/build-cachelib.yml
+++ /dev/null
@@ -1,147 +0,0 @@
-# NOTES:
-# 1. While Github-Actions enables cache of dependencies,
-# Facebook's projects (folly,fizz,wangle,fbthrift)
-# are fast-moving targets - so we always checkout the latest version
-# (as opposed to using gitactions cache, which is recommended in the
-# documentation).
-#
-# 2. Using docker containers to build on CentOS and Debian,
-# Specifically CentOS v8.1.1911 as that
-# version is closest to Facebook's internal dev machines.
-#
-# 3. When using docker containers we install 'sudo',
-# as the docker images are typically very minimal and without
-# 'sudo', while the ./contrib/ scripts use sudo.
-#
-# 4. When using the docker containers we install 'git'
-# BEFORE getting the CacheLib source code (with the 'checkout' action).
-# Otherwise, the 'checkout@v2' action script falls back to downloading
-# the git repository files only, without the ".git" directory.
-# We need the ".git" directory to updating the git-submodules
-# (folly/wangle/fizz/fbthrift). See:
-# https://github.com/actions/checkout/issues/126#issuecomment-570288731
-#
-# 5. To reduce less-critical (and yet frequent) rebuilds, the jobs
-# check the author of the commit, and SKIP the build if
-# the author is "svcscm". These commits are automatic updates
-# for the folly/fbthrift git-submodules, and can happen several times a day.
-# While there is a possiblity that updating the git-submodules breaks
-# CacheLib, it is less likely, and will be detected once an actual
-# code change commit triggers a full build.
-# e.g. https://github.com/facebookincubator/CacheLib/commit/9372a82190dd71a6e2bcb668828cfed9d1bd25c1
-#
-# 6. The 'if' condition checking the author name of the commit (see #5 above)
-# uses github actions metadata variable:
-# 'github.event.head_commit.author.name'
-# GitHub have changed in the past the metadata structure and broke
-# such conditions. If you need to debug the metadata values,
-# see the "dummy-show-github-event" job below.
-# E.g. https://github.blog/changelog/2019-10-16-changes-in-github-actions-push-event-payload/
-# As of Jan-2021, the output is:
-# {
-# "author": {
-# "email": "mimi@moo.moo",
-# "name": "mimi"
-# },
-# "committer": {
-# "email": "assafgordon@gmail.com",
-# "name": "Assaf Gordon",
-# "username": "agordon"
-# },
-# "distinct": true,
-# "id": "6c3aab0970f4a07cc2af7658756a6ef9d82f3276",
-# "message": "gitactions: test",
-# "timestamp": "2021-01-26T11:11:57-07:00",
-# "tree_id": "741cd1cb802df84362a51e5d01f28788845d08b7",
-# "url": "https://github.com/agordon/CacheLib/commit/6c3aab0970f4a07cc2af7658756a6ef9d82f3276"
-# }
-#
-# 7. When checking the commit's author name, we use '...author.name',
-# NOT '...author.username' - because the 'svcscm' author does not
-# have a github username (see the 'mimi' example above).
-#
-
-name: build-cachelib
-on: [push]
-jobs:
- dummy-show-github-event:
- name: "Show GitHub Action event.head_commit variable"
- runs-on: ubuntu-latest
- steps:
- - name: "GitHub Variable Content"
- env:
- CONTENT: ${{ toJSON(github.event.head_commit) }}
- run: echo "$CONTENT"
-
-
- build-cachelib-centos8-1-1911:
- if: "!contains(github.event.head_commit.author.name, 'svcscm')"
- name: "CentOS/8.1.1911 - Build CacheLib with all dependencies"
- runs-on: ubuntu-latest
- # Docker container image name
- container: "centos:8.1.1911"
- steps:
- - name: "update packages"
- # stock centos has a problem with CMAKE, fails with:
- # "cmake: symbol lookup error: cmake: undefined symbol: archive_write_add_filter_zstd"
- # updating solves it
- run: dnf update -y
- - name: "install sudo,git"
- run: dnf install -y sudo git cmake gcc
- - name: "System Information"
- run: |
- echo === uname ===
- uname -a
- echo === /etc/os-release ===
- cat /etc/os-release
- echo === df -hl ===
- df -hl
- echo === free -h ===
- free -h
- echo === top ===
- top -b -n1 -1 -Eg || timeout 1 top -b -n1
- echo === env ===
- env
- echo === gcc -v ===
- gcc -v
- - name: "checkout sources"
- uses: actions/checkout@v2
- - name: "Install Prerequisites"
- run: ./contrib/build.sh -S -B
- - name: "Test: update-submodules"
- run: ./contrib/update-submodules.sh
- - name: "Install dependency: zstd"
- run: ./contrib/build-package.sh -j -v -i zstd
- - name: "Install dependency: googleflags"
- run: ./contrib/build-package.sh -j -v -i googleflags
- - name: "Install dependency: googlelog"
- run: ./contrib/build-package.sh -j -v -i googlelog
- - name: "Install dependency: googletest"
- run: ./contrib/build-package.sh -j -v -i googletest
- - name: "Install dependency: sparsemap"
- run: ./contrib/build-package.sh -j -v -i sparsemap
- - name: "Install dependency: fmt"
- run: ./contrib/build-package.sh -j -v -i fmt
- - name: "Install dependency: folly"
- run: ./contrib/build-package.sh -j -v -i folly
- - name: "Install dependency: fizz"
- run: ./contrib/build-package.sh -j -v -i fizz
- - name: "Install dependency: wangle"
- run: ./contrib/build-package.sh -j -v -i wangle
- - name: "Install dependency: fbthrift"
- run: ./contrib/build-package.sh -j -v -i fbthrift
- - name: "build CacheLib"
- # Build cachelib in debug mode (-d) and with all tests (-t)
- run: ./contrib/build-package.sh -j -v -i -d -t cachelib
- - uses: actions/upload-artifact@v2
- if: failure()
- with:
- name: cachelib-cmake-logs
- path: |
- build-cachelib/CMakeFiles/*.log
- build-cachelib/CMakeCache.txt
- build-cachelib/Makefile
- build-cachelib/**/Makefile
- if-no-files-found: warn
- retention-days: 1
-
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 99370135ff..9f76f8ab6c 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -1,6 +1,6 @@
# From: https://github.com/marketplace/actions/clang-format-check#multiple-paths
name: clang-format Check
-on: [pull_request]
+on: []
jobs:
formatting-check:
name: Formatting Check
diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md
new file mode 100644
index 0000000000..d116f210a0
--- /dev/null
+++ b/MultiTierDataMovement.md
@@ -0,0 +1,117 @@
+# Background Data Movement
+
+To reduce the number of online (synchronous) evictions and to support
+asynchronous promotion, we have added two periodic workers: one for eviction
+and one for promotion.
+
+The diagram below shows a simplified version of how the background evictor
+thread (green) is integrated into the CacheLib architecture.
+
+![Background evictor](cachelib-background-evictor.png)
+
+## Synchronous Eviction and Promotion
+
+- `disableEvictionToMemory`: Disables eviction to the next memory tier (the
+item is always evicted to NVMe or removed on eviction).
+
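+A minimal sketch of applying this flag (illustrative only; the real option
+lives on the cache allocator config, and only the option name above is taken
+from this document):
+
+```cpp
+// With eviction to the next memory tier disabled, items evicted from DRAM
+// go straight to NVMe, or are dropped when no NVMe cache is configured.
+config.disableEvictionToMemory = true;
+```
+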
+## Background Evictors
+
+The background evictors scan each allocation class to see if there are objects
+to move to the next (lower) tier using a given strategy. Here we document the
+general parameters and the parameters for the different strategies; a
+configuration sketch follows this list.
+
+- `backgroundEvictorIntervalMilSec`: The interval at which this thread runs.
+By default the background evictor threads wake up every 10 ms to scan the
+AllocationClasses. In addition, a background evictor thread is woken up every
+time an allocation fails (in a request handling thread) and the current
+percentage of free memory for the AllocationClass is lower than
+`lowEvictionAcWatermark`. This can make the interval parameter less important
+when many allocations occur from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run - each thread is
+assigned a set of AllocationClasses to scan and evict objects from. Currently
+each thread gets an equal number of classes to scan, but since the object size
+distribution may be unequal, future versions will attempt to balance the
+classes among threads. The range is 1 to the number of AllocationClasses.
+The default is 1.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction call.
+The default is 40; the lower bound is 10 and the upper bound is 1000. Too low
+and we might not remove objects at a reasonable rate; too high and it may
+increase contention with user threads.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there
+are any candidates).
+
+- `maxEvictionPromotionHotness`: Maximum number of candidates to consider for
+eviction. This is similar to `maxEvictionBatch`, but it specifies how many
+candidates will be taken into consideration rather than the actual number of
+items to evict. This option can be used to configure the duration of the
+critical section on the LRU lock.
+
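+A configuration sketch for the general worker knobs. The setter name
+`enableBackgroundEvictor` is an assumption (it mirrors the
+`backgroundEvictorInterval`/`backgroundEvictorThreads` config fields consumed
+by `initWorkers()` in this patch); the values are the defaults above:
+
+```cpp
+config.enableBackgroundEvictor(
+    strategy,                       // e.g. a FreeThresholdStrategy, see below
+    std::chrono::milliseconds(10),  // backgroundEvictorIntervalMilSec
+    1);                             // evictorThreads
+```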
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run
+when less than this percentage of the AllocationClass is free.
+The default is `2.0`; to avoid wasting capacity we don't set this above `10.0`.
+
+- `highEvictionAcWatermark`: Stops eviction from an AllocationClass once this
+percentage of the AllocationClass is free. The default is `5.0`; to avoid
+wasting capacity we don't set this above `10.0`.
+
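+With the defaults above, an AllocationClass whose free space drops below 2%
+wakes the evictor, which keeps evicting until 5% of the class is free. A
+sketch of building the strategy (the constructor signature is an assumption
+based on `FreeThresholdStrategy.cpp` added by this patch):
+
+```cpp
+auto strategy = std::make_shared<FreeThresholdStrategy>(
+    /*lowEvictionAcWatermark=*/2.0,
+    /*highEvictionAcWatermark=*/5.0,
+    /*maxEvictionBatch=*/40,
+    /*minEvictionBatch=*/10);
+```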
+
+## Background Promoters
+
+The background promoters scan each allocation class to see if there are
+objects to move to the next (upper) tier using a given strategy. Here we
+document the general parameters and the parameters for the different
+strategies; a configuration sketch appears at the end of this section.
+
+- `backgroundPromoterIntervalMilSec`: The interval at which this thread runs.
+By default the background promoter threads wake up every 10 ms to scan the
+AllocationClasses for objects to promote.
+
+- `promoterThreads`: The number of background promoters to run - each thread
+is assigned a set of AllocationClasses to scan and promote objects from.
+Currently each thread gets an equal number of classes to scan, but since the
+object size distribution may be unequal, future versions will attempt to
+balance the classes among threads. The range is `1` to the number of
+AllocationClasses. The default is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion
+call. The default is 40; the lower bound is 10 and the upper bound is 1000.
+Too low and we might not promote objects at a reasonable rate; too high and it
+may increase contention with user threads.
+
+- `minPromotionBatch`: Minimum number of items to promote at any time (if
+there are any candidates).
+
+- `numDuplicateElements`: This allows us to promote items that have existing
+(read-only) handles, since the data will not be modified while a user holds
+one. As a result, for a short time the data may reside in both tiers until it
+is evicted from its original tier. The default (`0`) disallows this; setting
+the value to `100` enables duplicate elements across tiers.
+
+### Background Promotion Strategy (only one currently)
+
+- `promotionAcWatermark`: Promote items if at least this percentage of the
+AllocationClass is free. The promotion thread will attempt to move up to
+`maxPromotionBatch` objects to the upper tier. The objects are chosen from the
+head of the LRU. The default is `4.0`. This value should be set in relation to
+`lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`,
+and `maxAcAllocationWatermark`.
+- `maxPromotionBatch`: The number of objects to promote in a batch during
+background promotion. Analogous to `maxEvictionBatch`; its value should be
+lower to decrease contention on hot items.
+
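+The promoter is wired up symmetrically to the evictor. A sketch with the same
+caveats (the setter name is an assumption mirroring the
+`backgroundPromoterInterval`/`backgroundPromoterThreads` config fields):
+
+```cpp
+config.enableBackgroundPromoter(
+    strategy,                       // reused BackgroundEvictorStrategy; a
+                                    // dedicated promotion strategy is a TODO
+    std::chrono::milliseconds(10),  // backgroundPromoterIntervalMilSec
+    1);                             // promoterThreads
+```
+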
+## Allocation policies
+
+- `maxAcAllocationWatermark`: The item is always allocated in the topmost tier
+if at least this percentage of the AllocationClass is free.
+- `minAcAllocationWatermark`: The item is always allocated in the bottom tier
+if at most this percentage of the AllocationClass is free. If the free
+percentage is between `minAcAllocationWatermark` and `maxAcAllocationWatermark`,
+extra checks (described below) are performed to decide where to put the element.
+
+By default, allocation is always performed from the upper tier.
+
+- `acTopTierEvictionWatermark`: If less than this percentage of memory is free
+in the topmost tier, cachelib will attempt to evict from the top tier. This
+option takes precedence over the allocation watermarks above.
+
+### Extra policies
+
+Used only when the percentage of free memory in the AllocationClass is between
+`minAcAllocationWatermark` and `maxAcAllocationWatermark`.
+
+- `sizeThresholdPolicy`: If the item is smaller than this value, always
+allocate it in the upper tier.
+- `defaultTierChancePercentage`: Chance (0-100%) of allocating the item in the
+top tier.
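+
+Putting the watermarks together, the per-allocation tier choice made by
+`getTargetTierForItem()` (added to `CacheAllocator-inl.h` in this diff)
+reduces to roughly the following condensed sketch, where `freePct` is the
+approximate free percentage of the allocation class in the top tier:
+
+```cpp
+// Condensed illustration of the decision flow; not the verbatim implementation.
+TierId pickTier(double freePct, uint32_t requiredSize, const Config& cfg) {
+  if (freePct >= cfg.maxAcAllocationWatermark) return 0;  // top tier
+  if (freePct <= cfg.minAcAllocationWatermark) return 1;  // next tier
+  if (cfg.sizeThresholdPolicy)  // in-between: size-based extra check
+    return requiredSize < cfg.sizeThresholdPolicy ? 0 : 1;
+  return (folly::Random::rand32() % 100) < cfg.defaultTierChancePercentage
+             ? 0 : 1;
+}
+```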
+
+## MMContainer options
+
+- `lruInsertionPointSpec`: Can be set per tier when LRU2Q is used. Determines
+where new items are inserted: 0 = hot queue, 1 = warm queue, 2 = cold queue.
+- `markUsefulChance`: Per tier; determines the chance of moving an item to the
+head of the LRU on access.
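+
+For example, a two-tier setup might keep the default hot-queue insertion in
+DRAM but use `lruInsertionPointSpec = 2` on the second tier, so newly demoted
+items land in the cold queue and are first in line for eviction to NVMe.
+Expressed as named constants (illustrative only; the real option is the
+per-tier integer above):
+
+```cpp
+// Illustrative mapping of lruInsertionPointSpec values when LRU2Q is used.
+enum class LruInsertionPoint : uint8_t { Hot = 0, Warm = 1, Cold = 2 };
+```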
diff --git a/cachelib-background-evictor.png b/cachelib-background-evictor.png
new file mode 100644
index 0000000000..571db128b2
Binary files /dev/null and b/cachelib-background-evictor.png differ
diff --git a/cachelib/allocator/BackgroundEvictor-inl.h b/cachelib/allocator/BackgroundEvictor-inl.h
new file mode 100644
index 0000000000..9cec5d3930
--- /dev/null
+++ b/cachelib/allocator/BackgroundEvictor-inl.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+
+template <typename CacheT>
+BackgroundEvictor<CacheT>::BackgroundEvictor(Cache& cache,
+                       std::shared_ptr<BackgroundEvictorStrategy> strategy)
+    : cache_(cache),
+      strategy_(strategy)
+{
+}
+
+template <typename CacheT>
+BackgroundEvictor<CacheT>::~BackgroundEvictor() { stop(std::chrono::seconds(0)); }
+
+template <typename CacheT>
+void BackgroundEvictor<CacheT>::work() {
+ try {
+ checkAndRun();
+ } catch (const std::exception& ex) {
+ XLOGF(ERR, "BackgroundEvictor interrupted due to exception: {}", ex.what());
+ }
+}
+
+template <typename CacheT>
+void BackgroundEvictor<CacheT>::setAssignedMemory(
+    std::vector<std::tuple<TierId, PoolId, ClassId>>&& assignedMemory)
+{
+  XLOG(INFO, "Classes assigned to background worker:");
+ for (auto [tid, pid, cid] : assignedMemory) {
+ XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+ }
+
+ mutex.lock_combine([this, &assignedMemory]{
+ this->assignedMemory_ = std::move(assignedMemory);
+ });
+}
+
+// Look for classes that exceed the target memory capacity
+// and return those for eviction
+template <typename CacheT>
+void BackgroundEvictor<CacheT>::checkAndRun() {
+ auto assignedMemory = mutex.lock_combine([this]{
+ return assignedMemory_;
+ });
+
+ unsigned int evictions = 0;
+  std::set<ClassId> classes{};
+ auto batches = strategy_->calculateBatchSizes(cache_,assignedMemory);
+
+ for (size_t i = 0; i < batches.size(); i++) {
+ const auto [tid, pid, cid] = assignedMemory[i];
+ const auto batch = batches[i];
+
+ classes.insert(cid);
+ const auto& mpStats = cache_.getPoolByTid(pid,tid).getStats();
+
+ if (!batch) {
+ continue;
+ }
+
+ stats.evictionSize.add(batch * mpStats.acStats.at(cid).allocSize);
+
+    // try evicting `batch` items from the class in order to reach the free target
+    auto evicted =
+        BackgroundEvictorAPIWrapper<CacheT>::traverseAndEvictItems(cache_,
+            tid, pid, cid, batch);
+ evictions += evicted;
+ evictions_per_class_[tid][pid][cid] += evicted;
+ }
+
+ stats.numTraversals.inc();
+ stats.numEvictedItems.add(evictions);
+ stats.totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundEvictionStats BackgroundEvictor<CacheT>::getStats() const noexcept {
+ BackgroundEvictionStats evicStats;
+ evicStats.numEvictedItems = stats.numEvictedItems.get();
+ evicStats.runCount = stats.numTraversals.get();
+ evicStats.evictionSize = stats.evictionSize.get();
+ evicStats.totalClasses = stats.totalClasses.get();
+
+ return evicStats;
+}
+
+template <typename CacheT>
+std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+BackgroundEvictor<CacheT>::getClassStats() const noexcept {
+ return evictions_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundEvictor.h b/cachelib/allocator/BackgroundEvictor.h
new file mode 100644
index 0000000000..7583732127
--- /dev/null
+++ b/cachelib/allocator/BackgroundEvictor.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/concurrency/DistributedMutex.h>
+#include <gtest/gtest_prod.h>
+
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/PeriodicWorker.h"
+#include "cachelib/allocator/BackgroundEvictorStrategy.h"
+#include "cachelib/common/AtomicCounter.h"
+
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the eviction.
+template <typename C>
+struct BackgroundEvictorAPIWrapper {
+
+ static size_t traverseAndEvictItems(C& cache,
+ unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+ return cache.traverseAndEvictItems(tid,pid,cid,batch);
+ }
+};
+
+struct BackgroundEvictorStats {
+ // items evicted
+ AtomicCounter numEvictedItems{0};
+
+ // traversals
+ AtomicCounter numTraversals{0};
+
+  // total number of classes scanned
+ AtomicCounter totalClasses{0};
+
+ // item eviction size
+ AtomicCounter evictionSize{0};
+};
+
+// Periodic worker that evicts items from tiers in batches
+// The primary aim is to reduce insertion times for new items in the
+// cache
+template <typename CacheT>
+class BackgroundEvictor : public PeriodicWorker {
+ public:
+  using Cache = CacheT;
+  // @param cache    the cache interface
+  // @param strategy the strategy used to compute per-class eviction
+  //                 batch sizes
+  BackgroundEvictor(Cache& cache,
+                    std::shared_ptr<BackgroundEvictorStrategy> strategy);
+
+ ~BackgroundEvictor() override;
+
+ BackgroundEvictionStats getStats() const noexcept;
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>> getClassStats() const noexcept;
+
+  void setAssignedMemory(std::vector<std::tuple<TierId, PoolId, ClassId>>&& assignedMemory);
+
+ private:
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>> evictions_per_class_;
+
+ // cache allocator's interface for evicting
+
+ using Item = typename Cache::Item;
+
+ Cache& cache_;
+  std::shared_ptr<BackgroundEvictorStrategy> strategy_;
+
+ // implements the actual logic of running the background evictor
+ void work() override final;
+ void checkAndRun();
+
+ BackgroundEvictorStats stats;
+
+  std::vector<std::tuple<TierId, PoolId, ClassId>> assignedMemory_;
+ folly::DistributedMutex mutex;
+};
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundEvictor-inl.h"
diff --git a/cachelib/allocator/BackgroundEvictorStrategy.h b/cachelib/allocator/BackgroundEvictorStrategy.h
new file mode 100644
index 0000000000..1d05a801bb
--- /dev/null
+++ b/cachelib/allocator/BackgroundEvictorStrategy.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
+
+// Base class for background eviction strategy.
+class BackgroundEvictorStrategy {
+
+public:
+  virtual std::vector<size_t> calculateBatchSizes(const CacheBase& cache,
+      std::vector<std::tuple<TierId, PoolId, ClassId>> acVec) = 0;
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundPromoter-inl.h b/cachelib/allocator/BackgroundPromoter-inl.h
new file mode 100644
index 0000000000..daa6ae0a93
--- /dev/null
+++ b/cachelib/allocator/BackgroundPromoter-inl.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+
+template <typename CacheT>
+BackgroundPromoter<CacheT>::BackgroundPromoter(Cache& cache,
+                       std::shared_ptr<BackgroundEvictorStrategy> strategy)
+    : cache_(cache),
+      strategy_(strategy)
+{
+}
+
+template <typename CacheT>
+BackgroundPromoter<CacheT>::~BackgroundPromoter() { stop(std::chrono::seconds(0)); }
+
+template <typename CacheT>
+void BackgroundPromoter<CacheT>::work() {
+ try {
+ checkAndRun();
+ } catch (const std::exception& ex) {
+ XLOGF(ERR, "BackgroundPromoter interrupted due to exception: {}", ex.what());
+ }
+}
+
+template <typename CacheT>
+void BackgroundPromoter<CacheT>::setAssignedMemory(
+    std::vector<std::tuple<TierId, PoolId, ClassId>>&& assignedMemory)
+{
+  XLOG(INFO, "Classes assigned to background worker:");
+ for (auto [tid, pid, cid] : assignedMemory) {
+ XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+ }
+
+ mutex.lock_combine([this, &assignedMemory]{
+ this->assignedMemory_ = std::move(assignedMemory);
+ });
+}
+
+// Look for classes that have candidates to move to a faster tier
+// and promote those
+template <typename CacheT>
+void BackgroundPromoter<CacheT>::checkAndRun() {
+ auto assignedMemory = mutex.lock_combine([this]{
+ return assignedMemory_;
+ });
+
+ unsigned int promotions = 0;
+  std::set<ClassId> classes{};
+
+ auto batches = strategy_->calculateBatchSizes(cache_,assignedMemory);
+
+ for (size_t i = 0; i < batches.size(); i++) {
+ const auto [tid, pid, cid] = assignedMemory[i];
+ const auto batch = batches[i];
+
+
+ classes.insert(cid);
+ const auto& mpStats = cache_.getPoolByTid(pid,tid).getStats();
+ if (!batch) {
+ continue;
+ }
+
+ // stats.promotionsize.add(batch * mpStats.acStats.at(cid).allocSize);
+
+    // try promoting `batch` items from the class
+    auto promoted =
+        BackgroundPromoterAPIWrapper<CacheT>::traverseAndPromoteItems(cache_,
+            tid, pid, cid, batch);
+ promotions += promoted;
+ promotions_per_class_[tid][pid][cid] += promoted;
+ }
+
+ stats.numTraversals.inc();
+ stats.numPromotedItems.add(promotions);
+ // stats.totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundPromotionStats BackgroundPromoter<CacheT>::getStats() const noexcept {
+ BackgroundPromotionStats promoStats;
+ promoStats.numPromotedItems = stats.numPromotedItems.get();
+ promoStats.runCount = stats.numTraversals.get();
+
+ return promoStats;
+}
+
+template <typename CacheT>
+std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+BackgroundPromoter<CacheT>::getClassStats() const noexcept {
+ return promotions_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundPromoter.h b/cachelib/allocator/BackgroundPromoter.h
new file mode 100644
index 0000000000..04e0e7d187
--- /dev/null
+++ b/cachelib/allocator/BackgroundPromoter.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/concurrency/DistributedMutex.h>
+#include <gtest/gtest_prod.h>
+
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/PeriodicWorker.h"
+#include "cachelib/allocator/BackgroundEvictorStrategy.h"
+#include "cachelib/common/AtomicCounter.h"
+
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the promotion.
+template <typename C>
+struct BackgroundPromoterAPIWrapper {
+
+ static size_t traverseAndPromoteItems(C& cache,
+ unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+ return cache.traverseAndPromoteItems(tid,pid,cid,batch);
+ }
+};
+
+struct BackgroundPromoterStats {
+  // items promoted
+  AtomicCounter numPromotedItems{0};
+
+  // traversals
+  AtomicCounter numTraversals{0};
+
+  // total number of classes scanned
+  AtomicCounter totalClasses{0};
+
+  // item promotion size
+  AtomicCounter promotionSize{0};
+};
+
+template <typename CacheT>
+class BackgroundPromoter : public PeriodicWorker {
+ public:
+ using Cache = CacheT;
+  // @param cache    the cache interface
+  // @param strategy the strategy used to compute per-class promotion
+  //                 batch sizes
+  BackgroundPromoter(Cache& cache,
+                     std::shared_ptr<BackgroundEvictorStrategy> strategy);
+ // TODO: use separate strategy for eviction and promotion
+
+ ~BackgroundPromoter() override;
+
+ // TODO
+ BackgroundPromotionStats getStats() const noexcept;
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>> getClassStats() const noexcept;
+
+  void setAssignedMemory(std::vector<std::tuple<TierId, PoolId, ClassId>>&& assignedMemory);
+
+ private:
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>> promotions_per_class_;
+
+  // cache allocator's interface for promoting
+
+ using Item = typename Cache::Item;
+
+ Cache& cache_;
+  std::shared_ptr<BackgroundEvictorStrategy> strategy_;
+
+  // implements the actual logic of running the background promoter
+ void work() override final;
+ void checkAndRun();
+
+ BackgroundPromoterStats stats;
+
+  std::vector<std::tuple<TierId, PoolId, ClassId>> assignedMemory_;
+ folly::DistributedMutex mutex;
+};
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundPromoter-inl.h"
diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt
index 0c19c720d8..8dc0166ecf 100644
--- a/cachelib/allocator/CMakeLists.txt
+++ b/cachelib/allocator/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library (cachelib_allocator
CCacheManager.cpp
ContainerTypes.cpp
FreeMemStrategy.cpp
+ FreeThresholdStrategy.cpp
HitsPerSlabStrategy.cpp
LruTailAgeStrategy.cpp
MarginalHitsOptimizeStrategy.cpp
@@ -81,6 +82,7 @@ if (BUILD_TESTS)
${DATASTRUCT_TESTS_THRIFT_FILES}
./nvmcache/tests/NvmTestBase.cpp
./memory/tests/TestBase.cpp
+ ../common/TestUtils.cpp
)
add_dependencies(allocator_test_support thrift_generated_files)
target_link_libraries (allocator_test_support PUBLIC
@@ -116,8 +118,11 @@ if (BUILD_TESTS)
add_test (tests/ChainedHashTest.cpp)
add_test (tests/AllocatorResizeTypeTest.cpp)
add_test (tests/AllocatorHitStatsTypeTest.cpp)
+ add_test (tests/AllocatorMemoryTiersTest.cpp)
+ add_test (tests/MemoryTiersTest.cpp)
add_test (tests/MultiAllocatorTest.cpp)
add_test (tests/NvmAdmissionPolicyTest.cpp)
+ add_test (tests/CacheAllocatorConfigTest.cpp)
add_test (nvmcache/tests/NvmItemTests.cpp)
add_test (nvmcache/tests/InFlightPutsTest.cpp)
add_test (nvmcache/tests/TombStoneTests.cpp)
diff --git a/cachelib/allocator/Cache.cpp b/cachelib/allocator/Cache.cpp
index 0e812fb10e..7f6bfe737c 100644
--- a/cachelib/allocator/Cache.cpp
+++ b/cachelib/allocator/Cache.cpp
@@ -23,6 +23,12 @@
namespace facebook {
namespace cachelib {
+CacheBase::CacheBase(unsigned numTiers): numTiers_(numTiers) {}
+
+unsigned CacheBase::getNumTiers() const {
+ return numTiers_;
+}
+
void CacheBase::setRebalanceStrategy(
    PoolId pid, std::shared_ptr<RebalanceStrategy> strategy) {
std::unique_lock l(lock_);
diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index a737074ac6..f021eb0aaa 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -74,7 +74,7 @@ enum class DestructorContext {
// A base class of cache exposing members and status agnostic of template type.
class CacheBase {
public:
- CacheBase() = default;
+ CacheBase(unsigned numTiers = 1);
virtual ~CacheBase() = default;
// Movable but not copyable
@@ -83,6 +83,9 @@ class CacheBase {
CacheBase(CacheBase&&) = default;
CacheBase& operator=(CacheBase&&) = default;
+ // TODO: come up with some reasonable number
+ static constexpr unsigned kMaxTiers = 2;
+
// Get a string referring to the cache name for this cache
virtual const std::string getCacheName() const = 0;
@@ -90,6 +93,12 @@ class CacheBase {
//
// @param poolId The pool id to query
virtual const MemoryPool& getPool(PoolId poolId) const = 0;
+
+ // Get the reference to a memory pool using a tier id, for stats purposes
+ //
+ // @param poolId The pool id to query
+ // @param tierId The tier of the pool id
+ virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0;
// Get Pool specific stats (regular pools). This includes stats from the
// Memory Pool and also the cache.
@@ -97,6 +106,9 @@ class CacheBase {
// @param poolId the pool id
virtual PoolStats getPoolStats(PoolId poolId) const = 0;
+ virtual AllocationClassBaseStat getAllocationClassStats(
+ TierId, PoolId pid, ClassId cid) const = 0;
+
// @param poolId the pool id
virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0;
@@ -271,6 +283,10 @@ class CacheBase {
// @return The number of slabs that were actually reclaimed (<= numSlabs)
virtual unsigned int reclaimSlabs(PoolId id, size_t numSlabs) = 0;
+ unsigned getNumTiers() const;
+
+ unsigned numTiers_ = 1;
+
// Protect 'poolRebalanceStragtegies_' and `poolResizeStrategies_`
// and `poolOptimizeStrategy_`
mutable std::mutex lock_;
diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h
index a512ed4b6b..fa29ca47a4 100644
--- a/cachelib/allocator/CacheAllocator-inl.h
+++ b/cachelib/allocator/CacheAllocator-inl.h
@@ -16,23 +16,24 @@
#pragma once
+#include <folly/Random.h>
+
namespace facebook {
namespace cachelib {
template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(Config config)
- : isOnShm_{config.memMonitoringEnabled()},
+ : CacheBase(config.getMemoryTierConfigs().size()),
+ memoryTierConfigs(config.getMemoryTierConfigs()),
+ isOnShm_{config.memMonitoringEnabled()},
config_(config.validate()),
-      tempShm_(isOnShm_ ? std::make_unique<TempShmMapping>(config_.size)
+      tempShm_(isOnShm_ ? std::make_unique<TempShmMapping>(
+                              config_.getCacheSize())
                         : nullptr),
-      allocator_(isOnShm_ ? std::make_unique<MemoryAllocator>(
-                                getAllocatorConfig(config_),
-                                tempShm_->getAddr(),
-                                config_.size)
-                          : std::make_unique<MemoryAllocator>(
-                                getAllocatorConfig(config_), config_.size)),
-      compactCacheManager_(std::make_unique<CCacheManager>(*allocator_)),
+      allocator_(createPrivateAllocator()),
+      compactCacheManager_(std::make_unique<CCacheManager>(*allocator_[0] /* TODO */)),
compressor_(createPtrCompressor()),
+ mmContainers_(numTiers_),
      accessContainer_(std::make_unique<AccessContainer>(
config_.accessConfig,
compressor_,
@@ -43,21 +44,68 @@ CacheAllocator<CacheTrait>::CacheAllocator(Config config)
[this](Item* it) -> ItemHandle { return acquire(it); })),
chainedItemLocks_(config_.chainedItemsLockPower,
                        std::make_shared<MurmurHash2>()),
- cacheCreationTime_{util::getCurrentTimeSec()},
- nvmCacheState_{config_.cacheDir, config_.isNvmCacheEncryptionEnabled(),
- config_.isNvmCacheTruncateAllocSizeEnabled()} {
+ movesMap_(kShards),
+ moveLock_(kShards),
+ cacheCreationTime_{util::getCurrentTimeSec()} {
+
+  if (numTiers_ > 1 || std::holds_alternative<FileShmSegmentOpts>(
+          memoryTierConfigs[0].getShmTypeOpts())) {
+ throw std::runtime_error(
+ "Using custom memory tier or using more than one tier is only "
+ "supported for Shared Memory.");
+ }
initCommon(false);
}
+template <typename CacheTrait>
+std::vector<std::unique_ptr<MemoryAllocator>>
+CacheAllocator<CacheTrait>::createPrivateAllocator() {
+  std::vector<std::unique_ptr<MemoryAllocator>> allocators;
+
+  if (isOnShm_)
+    allocators.emplace_back(std::make_unique<MemoryAllocator>(
+        getAllocatorConfig(config_),
+        tempShm_->getAddr(),
+        config_.size));
+  else
+    allocators.emplace_back(std::make_unique<MemoryAllocator>(
+        getAllocatorConfig(config_), config_.size));
+
+  return allocators;
+}
+
+template <typename CacheTrait>
+std::vector<std::unique_ptr<MemoryAllocator>>
+CacheAllocator<CacheTrait>::createAllocators() {
+  std::vector<std::unique_ptr<MemoryAllocator>> allocators;
+  for (int tid = 0; tid < numTiers_; tid++) {
+    allocators.emplace_back(createNewMemoryAllocator(tid));
+  }
+  return allocators;
+}
+
+template <typename CacheTrait>
+std::vector<std::unique_ptr<MemoryAllocator>>
+CacheAllocator<CacheTrait>::restoreAllocators() {
+  std::vector<std::unique_ptr<MemoryAllocator>> allocators;
+  for (int tid = 0; tid < numTiers_; tid++) {
+    allocators.emplace_back(restoreMemoryAllocator(tid));
+  }
+  return allocators;
+}
+
template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(SharedMemNewT, Config config)
- : isOnShm_{true},
+ : CacheBase(config.getMemoryTierConfigs().size()),
+ memoryTierConfigs(config.getMemoryTierConfigs()),
+ isOnShm_{true},
config_(config.validate()),
shmManager_(
-          std::make_unique<ShmManager>(config_.cacheDir, config_.usePosixShm)),
-      allocator_(createNewMemoryAllocator()),
-      compactCacheManager_(std::make_unique<CCacheManager>(*allocator_)),
+          std::make_unique<ShmManager>(config_.cacheDir, config_.isUsingPosixShm())),
+      allocator_(createAllocators()),
+      compactCacheManager_(std::make_unique<CCacheManager>(*allocator_[0] /* TODO */)),
compressor_(createPtrCompressor()),
+ mmContainers_(numTiers_),
      accessContainer_(std::make_unique<AccessContainer>(
config_.accessConfig,
shmManager_
@@ -65,7 +113,8 @@ CacheAllocator<CacheTrait>::CacheAllocator(SharedMemNewT, Config config)
AccessContainer::getRequiredSize(
config_.accessConfig.getNumBuckets()),
nullptr,
- ShmSegmentOpts(config_.accessConfig.getPageSize()))
+ ShmSegmentOpts(config_.accessConfig.getPageSize(),
+ false, config_.isUsingPosixShm()))
.addr,
compressor_,
[this](Item* it) -> ItemHandle { return acquire(it); })),
@@ -76,48 +125,55 @@ CacheAllocator<CacheTrait>::CacheAllocator(SharedMemNewT, Config config)
AccessContainer::getRequiredSize(
config_.chainedItemAccessConfig.getNumBuckets()),
nullptr,
- ShmSegmentOpts(config_.accessConfig.getPageSize()))
+ ShmSegmentOpts(config_.accessConfig.getPageSize(),
+ false, config_.isUsingPosixShm()))
.addr,
compressor_,
[this](Item* it) -> ItemHandle { return acquire(it); })),
chainedItemLocks_(config_.chainedItemsLockPower,
                        std::make_shared<MurmurHash2>()),
- cacheCreationTime_{util::getCurrentTimeSec()},
- nvmCacheState_{config_.cacheDir, config_.isNvmCacheEncryptionEnabled(),
- config_.isNvmCacheTruncateAllocSizeEnabled()} {
+ movesMap_(kShards),
+ moveLock_(kShards),
+ cacheCreationTime_{util::getCurrentTimeSec()} {
initCommon(false);
- shmManager_->removeShm(detail::kShmInfoName);
+ shmManager_->removeShm(detail::kShmInfoName,
+ PosixSysVSegmentOpts(config_.isUsingPosixShm()));
}
template <typename CacheTrait>
CacheAllocator<CacheTrait>::CacheAllocator(SharedMemAttachT, Config config)
- : isOnShm_{true},
+ : CacheBase(config.getMemoryTierConfigs().size()),
+ memoryTierConfigs(config.getMemoryTierConfigs()),
+ isOnShm_{true},
config_(config.validate()),
shmManager_(
          std::make_unique<ShmManager>(config_.cacheDir, config_.usePosixShm)),
deserializer_(createDeserializer()),
metadata_{deserializeCacheAllocatorMetadata(*deserializer_)},
- allocator_(restoreMemoryAllocator()),
- compactCacheManager_(restoreCCacheManager()),
+ allocator_(restoreAllocators()),
+ compactCacheManager_(restoreCCacheManager(0 /* TODO - per tier */)),
compressor_(createPtrCompressor()),
mmContainers_(deserializeMMContainers(*deserializer_, compressor_)),
      accessContainer_(std::make_unique<AccessContainer>(
          deserializer_->deserialize<AccessSerializationType>(),
config_.accessConfig,
- shmManager_->attachShm(detail::kShmHashTableName),
+ shmManager_->attachShm(detail::kShmHashTableName, nullptr,
+ ShmSegmentOpts(PageSizeT::NORMAL, false, config_.isUsingPosixShm())),
compressor_,
[this](Item* it) -> ItemHandle { return acquire(it); })),
      chainedItemAccessContainer_(std::make_unique<AccessContainer>(
          deserializer_->deserialize<AccessSerializationType>(),
config_.chainedItemAccessConfig,
- shmManager_->attachShm(detail::kShmChainedItemHashTableName),
+ shmManager_->attachShm(detail::kShmChainedItemHashTableName, nullptr,
+ ShmSegmentOpts(PageSizeT::NORMAL, false, config_.isUsingPosixShm())),
compressor_,
[this](Item* it) -> ItemHandle { return acquire(it); })),
chainedItemLocks_(config_.chainedItemsLockPower,
                        std::make_shared<MurmurHash2>()),
- cacheCreationTime_{*metadata_.cacheCreationTime_ref()},
- nvmCacheState_{config_.cacheDir, config_.isNvmCacheEncryptionEnabled(),
- config_.isNvmCacheTruncateAllocSizeEnabled()} {
+ movesMap_(kShards),
+ moveLock_(kShards),
+ cacheCreationTime_{*metadata_.cacheCreationTime_ref()} {
+ /* TODO - per tier? */
for (auto pid : *metadata_.compactCachePools_ref()) {
isCompactCachePool_[pid] = true;
}
@@ -127,7 +183,8 @@ CacheAllocator<CacheTrait>::CacheAllocator(SharedMemAttachT, Config config)
// We will create a new info shm segment on shutDown(). If we don't remove
// this info shm segment here and the new info shm segment's size is larger
// than this one, creating new one will fail.
- shmManager_->removeShm(detail::kShmInfoName);
+ shmManager_->removeShm(detail::kShmInfoName,
+ PosixSysVSegmentOpts(config_.isUsingPosixShm()));
}
template <typename CacheTrait>
@@ -141,44 +198,65 @@ CacheAllocator<CacheTrait>::~CacheAllocator() {
}
template
-std::unique_ptr<MemoryAllocator>
-CacheAllocator<CacheTrait>::createNewMemoryAllocator() {
+ShmSegmentOpts CacheAllocator<CacheTrait>::createShmCacheOpts(TierId tid) {
ShmSegmentOpts opts;
opts.alignment = sizeof(Slab);
+ opts.typeOpts = memoryTierConfigs[tid].getShmTypeOpts();
+  if (auto *v = std::get_if<PosixSysVSegmentOpts>(&opts.typeOpts)) {
+ v->usePosix = config_.usePosixShm;
+ }
+
+ return opts;
+}
+
+template <typename CacheTrait>
+size_t CacheAllocator<CacheTrait>::memoryTierSize(TierId tid) const
+{
+ auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL,
+ [](const size_t i, const MemoryTierCacheConfig& config){
+ return i + config.getRatio();
+ });
+
+ return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions);
+}
+
+template <typename CacheTrait>
+std::unique_ptr<MemoryAllocator>
+CacheAllocator<CacheTrait>::createNewMemoryAllocator(TierId tid) {
  return std::make_unique<MemoryAllocator>(
getAllocatorConfig(config_),
shmManager_
- ->createShm(detail::kShmCacheName, config_.size,
- config_.slabMemoryBaseAddr, opts)
+ ->createShm(detail::kShmCacheName + std::to_string(tid),
+ config_.getCacheSize(), config_.slabMemoryBaseAddr,
+ createShmCacheOpts(tid))
.addr,
- config_.size);
+ memoryTierSize(tid)
+ );
}
template <typename CacheTrait>
std::unique_ptr<MemoryAllocator>
-CacheAllocator<CacheTrait>::restoreMemoryAllocator() {
- ShmSegmentOpts opts;
- opts.alignment = sizeof(Slab);
+CacheAllocator<CacheTrait>::restoreMemoryAllocator(TierId tid) {
  return std::make_unique<MemoryAllocator>(
      deserializer_->deserialize<MemoryAllocator::SerializationType>(),
shmManager_
- ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, opts)
- .addr,
- config_.size,
+ ->attachShm(detail::kShmCacheName + std::to_string(tid),
+ config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr,
+ memoryTierSize(tid),
config_.disableFullCoredump);
}
template <typename CacheTrait>
std::unique_ptr<CCacheManager>
-CacheAllocator<CacheTrait>::restoreCCacheManager() {
+CacheAllocator<CacheTrait>::restoreCCacheManager(TierId tid) {
  return std::make_unique<CCacheManager>(
      deserializer_->deserialize<CCacheManager::SerializationType>(),
- *allocator_);
+ *allocator_[tid]);
}
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::initCommon(bool dramCacheAttached) {
- if (config_.nvmConfig.has_value()) {
+ if (config_.isNvmCacheEnabled()) {
if (config_.nvmCacheAP) {
nvmAdmissionPolicy_ = config_.nvmCacheAP;
} else if (config_.rejectFirstAPNumEntries) {
@@ -201,25 +279,28 @@ void CacheAllocator<CacheTrait>::initCommon(bool dramCacheAttached) {
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::initNvmCache(bool dramCacheAttached) {
- if (!config_.nvmConfig.has_value()) {
+ if (!config_.isNvmCacheEnabled()) {
return;
}
+ nvmCacheState_.emplace(NvmCacheState(config_.cacheDir, config_.isNvmCacheEncryptionEnabled(),
+ config_.isNvmCacheTruncateAllocSizeEnabled()));
+
// for some usecases that create pools, restoring nvmcache when dram cache
// is not persisted is not supported.
const bool shouldDrop = config_.dropNvmCacheOnShmNew && !dramCacheAttached;
// if we are dealing with persistency, cache directory should be enabled
const bool truncate = config_.cacheDir.empty() ||
- nvmCacheState_.shouldStartFresh() || shouldDrop;
+ nvmCacheState_.value().shouldStartFresh() || shouldDrop;
if (truncate) {
- nvmCacheState_.markTruncated();
+ nvmCacheState_.value().markTruncated();
}
  nvmCache_ = std::make_unique<NvmCacheT>(*this, *config_.nvmConfig, truncate,
config_.itemDestructor);
if (!config_.cacheDir.empty()) {
- nvmCacheState_.clearPrevState();
+ nvmCacheState_.value().clearPrevState();
}
}
@@ -259,11 +340,24 @@ void CacheAllocator<CacheTrait>::initWorkers() {
config_.poolOptimizeStrategy,
config_.ccacheOptimizeStepSizePercent);
}
+
+ if (config_.backgroundEvictorEnabled()) {
+ startNewBackgroundEvictor(config_.backgroundEvictorInterval,
+ config_.backgroundEvictorStrategy,
+ config_.backgroundEvictorThreads);
+ }
+
+ if (config_.backgroundPromoterEnabled()) {
+ startNewBackgroundPromoter(config_.backgroundPromoterInterval,
+ config_.backgroundPromoterStrategy,
+ config_.backgroundPromoterThreads);
+ }
}
template <typename CacheTrait>
std::unique_ptr<Deserializer> CacheAllocator<CacheTrait>::createDeserializer() {
- auto infoAddr = shmManager_->attachShm(detail::kShmInfoName);
+ auto infoAddr = shmManager_->attachShm(detail::kShmInfoName, nullptr,
+ ShmSegmentOpts(PageSizeT::NORMAL, false, config_.isUsingPosixShm()));
  return std::make_unique<Deserializer>(
      reinterpret_cast<uint8_t*>(infoAddr.addr),
      reinterpret_cast<uint8_t*>(infoAddr.addr) + infoAddr.size);
@@ -280,16 +374,35 @@ CacheAllocator<CacheTrait>::allocate(PoolId poolId,
creationTime = util::getCurrentTimeSec();
}
return allocateInternal(poolId, key, size, creationTime,
- ttlSecs == 0 ? 0 : creationTime + ttlSecs);
+ ttlSecs == 0 ? 0 : creationTime + ttlSecs, false);
+}
+
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid)
+{
+ // TODO: should we also work on lower tiers? should we have separate set of params?
+ if (tid == 1) return false;
+ return getAllocationClassStats(tid, pid, cid).approxFreePercent <= config_.lowEvictionAcWatermark;
+}
+
+template <typename CacheTrait>
+size_t CacheAllocator<CacheTrait>::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers)
+{
+ XDCHECK(numWorkers);
+
+  // TODO: come up with some better sharding (use some hashing)
+ return (tid + pid + cid) % numWorkers;
}
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::ItemHandle
-CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
+CacheAllocator<CacheTrait>::allocateInternalTier(TierId tid,
+ PoolId pid,
typename Item::Key key,
uint32_t size,
uint32_t creationTime,
- uint32_t expiryTime) {
+ uint32_t expiryTime,
+ bool fromEvictorThread) {
util::LatencyTracker tracker{stats().allocateLatency_};
SCOPE_FAIL { stats_.invalidAllocs.inc(); };
@@ -298,16 +411,37 @@ CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
const auto requiredSize = Item::getRequiredSize(key, size);
// the allocation class in our memory allocator.
- const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
+ const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
+ util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]};
+ // TODO: per-tier
(*stats_.allocAttempts)[pid][cid].inc();
- void* memory = allocator_->allocate(pid, requiredSize);
+ void *memory = nullptr;
+
+ if (tid == 0 && config_.acTopTierEvictionWatermark > 0.0
+ && getAllocationClassStats(tid, pid, cid)
+ .approxFreePercent < config_.acTopTierEvictionWatermark) {
+ memory = findEviction(tid, pid, cid);
+ }
+
+ if (memory == nullptr) {
+ // TODO: should we try allocate item even if this will result in violating
+ // acTopTierEvictionWatermark?
+ memory = allocator_[tid]->allocate(pid, requiredSize);
+ }
+
+ if (backgroundEvictor_.size() && !fromEvictorThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) {
+ backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp();
+ }
+
+ // TODO: Today disableEviction means do not evict from memory (DRAM).
+ // Should we support eviction between memory tiers (e.g. from DRAM to PMEM)?
if (memory == nullptr && !config_.disableEviction) {
- memory = findEviction(pid, cid);
+ memory = findEviction(tid, pid, cid);
}
- ItemHandle handle;
+ WriteHandle handle;
if (memory != nullptr) {
// At this point, we have a valid memory allocation that is ready for use.
// Ensure that when we abort from here under any circumstances, we free up
@@ -315,7 +449,7 @@ CacheAllocator::allocateInternal(PoolId pid,
// for example.
SCOPE_FAIL {
// free back the memory to the allocator since we failed.
- allocator_->free(memory);
+ allocator_[tid]->free(memory);
};
handle = acquire(new (memory) Item(key, size, creationTime, expiryTime));
@@ -326,7 +460,7 @@ CacheAllocator::allocateInternal(PoolId pid,
}
} else { // failed to allocate memory.
- (*stats_.allocFailures)[pid][cid].inc();
+ (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier
// wake up rebalancer
if (poolRebalancer_) {
poolRebalancer_->wakeUp();
@@ -343,6 +477,74 @@ CacheAllocator::allocateInternal(PoolId pid,
return handle;
}
+template <typename CacheTrait>
+TierId
+CacheAllocator<CacheTrait>::getTargetTierForItem(PoolId pid,
+ typename Item::Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime) {
+ if (numTiers_ == 1)
+ return 0;
+
+ if (config_.forceAllocationTier != UINT64_MAX) {
+ return config_.forceAllocationTier;
+ }
+
+ const TierId defaultTargetTier = 0;
+
+ const auto requiredSize = Item::getRequiredSize(key, size);
+ const auto cid = allocator_[defaultTargetTier]->getAllocationClassId(pid, requiredSize);
+
+ auto freePercentage = getAllocationClassStats(defaultTargetTier, pid, cid).approxFreePercent;
+
+ // TODO: COULD we implement BG worker which would move slabs around
+ // so that there is similar amount of free space in each pool/ac.
+ // Should this be responsibility of BG evictor?
+
+ if (freePercentage >= config_.maxAcAllocationWatermark)
+ return defaultTargetTier;
+
+ if (freePercentage <= config_.minAcAllocationWatermark)
+ return defaultTargetTier + 1;
+
+ // TODO: we can even think about creating different allocation classes for PMEM
+ // and we could look at possible fragmentation when deciding where to put the item
+ if (config_.sizeThresholdPolicy)
+ return requiredSize < config_.sizeThresholdPolicy ? defaultTargetTier : defaultTargetTier + 1;
+
+ // TODO: (e.g. always put chained items to PMEM)
+ // if (chainedItemsPolicy)
+ // return item.isChainedItem() ? defaultTargetTier + 1 : defaultTargetTier;
+
+ // TODO:
+ // if (expiryTimePolicy)
+ // return (expiryTime - creationTime) < expiryTimePolicy ? defaultTargetTier : defaultTargetTier + 1;
+
+ // TODO:
+ // if (keyPolicy) // this can be based on key length or some other properties
+ // return getTargetTierForKey(key);
+
+ // TODO:
+ // if (compressabilityPolicy) // if compresses well store in PMEM? latency will be higher anyway
+ // return TODO;
+
+ // TODO: only works for 2 tiers
+ return (folly::Random::rand32() % 100) < config_.defaultTierChancePercentage ? defaultTargetTier : defaultTargetTier + 1;
+}
+
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::allocateInternal(PoolId pid,
+ typename Item::Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime,
+ bool fromEvictorThread) {
+ auto tid = getTargetTierForItem(pid, key, size, creationTime, expiryTime);
+ return allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromEvictorThread);
+}
+
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::WriteHandle
CacheAllocator<CacheTrait>::allocateChainedItem(const ReadHandle& parent,
@@ -373,21 +575,28 @@ CacheAllocator<CacheTrait>::allocateChainedItemInternal(
// number of bytes required for this item
const auto requiredSize = ChainedItem::getRequiredSize(size);
- const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId;
- const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
+ // TODO: is this correct?
+ auto tid = getTierId(*parent);
+
+ const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId;
+ const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
+ util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]};
+
+ // TODO: per-tier? Right now stats_ are not used in any public periodic
+ // worker
(*stats_.allocAttempts)[pid][cid].inc();
- void* memory = allocator_->allocate(pid, requiredSize);
+ void* memory = allocator_[tid]->allocate(pid, requiredSize);
if (memory == nullptr) {
- memory = findEviction(pid, cid);
+ memory = findEviction(tid, pid, cid);
}
if (memory == nullptr) {
(*stats_.allocFailures)[pid][cid].inc();
return ItemHandle{};
}
- SCOPE_FAIL { allocator_->free(memory); };
+ SCOPE_FAIL { allocator_[tid]->free(memory); };
auto child = acquire(
new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size,
@@ -696,8 +905,8 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
throw std::runtime_error(
folly::sformat("cannot release this item: {}", it.toString()));
}
-
- const auto allocInfo = allocator_->getAllocInfo(it.getMemory());
+ const auto tid = getTierId(it);
+ const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory());
if (ctx == RemoveContext::kEviction) {
const auto timeNow = util::getCurrentTimeSec();
@@ -721,8 +930,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
folly::sformat("Can not recycle a chained item {}, toRecyle",
it.toString(), toRecycle->toString()));
}
-
- allocator_->free(&it);
+ allocator_[tid]->free(&it);
return ReleaseRes::kReleased;
}
@@ -781,7 +989,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
auto next = head->getNext(compressor_);
const auto childInfo =
-        allocator_->getAllocInfo(static_cast<const void*>(head));
+        allocator_[tid]->getAllocInfo(static_cast<const void*>(head));
(*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub(
util::getFragmentation(*this, *head));
@@ -814,7 +1022,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
XDCHECK(ReleaseRes::kReleased != res);
res = ReleaseRes::kRecycled;
} else {
- allocator_->free(head);
+ allocator_[tid]->free(head);
}
}
@@ -829,7 +1037,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
res = ReleaseRes::kRecycled;
} else {
XDCHECK(it.isDrained());
- allocator_->free(&it);
+ allocator_[tid]->free(&it);
}
return res;
@@ -901,6 +1109,25 @@ bool CacheAllocator<CacheTrait>::replaceInMMContainer(Item& oldItem,
}
}
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::replaceInMMContainer(Item* oldItem,
+ Item& newItem) {
+ return replaceInMMContainer(*oldItem, newItem);
+}
+
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::replaceInMMContainer(EvictionIterator& oldItemIt,
+ Item& newItem) {
+ auto& oldContainer = getMMContainer(*oldItemIt);
+ auto& newContainer = getMMContainer(newItem);
+
+ // This function is used for eviction across tiers
+ XDCHECK(&oldContainer != &newContainer);
+ oldContainer.remove(oldItemIt);
+
+ return newContainer.add(newItem);
+}
+
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::replaceChainedItemInMMContainer(
Item& oldItem, Item& newItem) {
@@ -1045,6 +1272,157 @@ CacheAllocator<CacheTrait>::insertOrReplace(const ItemHandle& handle) {
return replaced;
}
+/* Next two methods are used to asynchronously move Item between memory tiers.
+ *
+ * The thread, which moves Item, allocates new Item in the tier we are moving to
+ * and calls moveRegularItemwithSync() method. This method does the following:
+ * 1. Create MoveCtx and put it to the movesMap.
+ * 2. Update the access container with the new item from the tier we are
+ * moving to. This Item has kIncomplete flag set.
+ * 3. Copy data from the old Item to the new one.
+ * 4. Unset the kIncomplete flag and Notify MoveCtx
+ *
+ * Concurrent threads which are getting handle to the same key:
+ * 1. When a handle is created it checks if the kIncomplete flag is set
+ * 2. If so, Handle implementation creates waitContext and adds it to the
+ * MoveCtx by calling addWaitContextForMovingItem() method.
+ * 3. Wait until the moving thread will complete its job.
+ */
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::addWaitContextForMovingItem(
+    folly::StringPiece key, std::shared_ptr<WaitContext<ItemHandle>> waiter) {
+ auto shard = getShardForKey(key);
+ auto& movesMap = getMoveMapForShard(shard);
+ auto lock = getMoveLockForShard(shard);
+ auto it = movesMap.find(key);
+ if (it == movesMap.end()) {
+ return false;
+ }
+ auto ctx = it->second.get();
+ ctx->addWaiter(std::move(waiter));
+ return true;
+}
+
+template <typename CacheTrait>
+template <typename P>
+typename CacheAllocator<CacheTrait>::ItemHandle
+CacheAllocator<CacheTrait>::moveRegularItemwithSync(
+    Item& oldItem, ItemHandle& newItemHdl, P&& predicate) {
+ XDCHECK(oldItem.isMoving());
+ // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_
+ // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_};
+
+ if (!oldItem.isAccessible() || oldItem.isExpired()) {
+ return {};
+ }
+
+ XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize());
+ XDCHECK_NE(getTierId(oldItem), getTierId(*newItemHdl));
+
+ // take care of the flags before we expose the item to be accessed. this
+ // will ensure that when another thread removes the item from RAM, we issue
+ // a delete accordingly. See D7859775 for an example
+ if (oldItem.isNvmClean()) {
+ newItemHdl->markNvmClean();
+ }
+
+ folly::StringPiece key(oldItem.getKey());
+ auto shard = getShardForKey(key);
+ auto& movesMap = getMoveMapForShard(shard);
+ MoveCtx* ctx(nullptr);
+ {
+ auto lock = getMoveLockForShard(shard);
+    auto res = movesMap.try_emplace(key, std::make_unique<MoveCtx>());
+ if (!res.second) {
+ return {};
+ }
+ ctx = res.first->second.get();
+ }
+
+ auto resHdl = ItemHandle{};
+ auto guard = folly::makeGuard([key, this, ctx, shard, &resHdl]() {
+ auto& movesMap = getMoveMapForShard(shard);
+ if (resHdl)
+ resHdl->unmarkIncomplete();
+ auto lock = getMoveLockForShard(shard);
+ ctx->setItemHandle(std::move(resHdl));
+ movesMap.erase(key);
+ });
+
+ // TODO: Possibly we can use markMoving() instead. But today
+ // moveOnSlabRelease logic assume that we mark as moving old Item
+ // and than do copy and replace old Item with the new one in access
+ // container. Furthermore, Item can be marked as Moving only
+ // if it is linked to MM container. In our case we mark the new Item
+ // and update access container before the new Item is ready (content is
+ // copied).
+ newItemHdl->markIncomplete();
+
+ // Inside the access container's lock, this checks if the old item is
+ // accessible and its refcount is zero. If the item is not accessible,
+ // there is no point in replacing it since it has already been removed
+ // or is in the process of being removed. If the item is in the cache but
+ // its refcount is non-zero, a user could be attempting to remove this
+ // item through an API such as remove(ItemHandle). In that case, it is
+ // unsafe to replace the old item with a new one, so we should also
+ // abort.
+ if (!accessContainer_->replaceIf(oldItem, *newItemHdl,
+ predicate)) {
+ return {};
+ }
+
+ if (config_.moveCb) {
+ // Execute the move callback. We cannot make any guarantees about the
+ // consistency of the old item beyond this point, because the callback can
+ // do more than a simple memcpy() e.g. update external references. If there
+ // are any remaining handles to the old item, it is the caller's
+ // responsibility to invalidate them. The move can only fail after this
+ // statement if the old item has been removed or replaced, in which case it
+ // should be fine for it to be left in an inconsistent state.
+ config_.moveCb(oldItem, *newItemHdl, nullptr);
+ } else {
+ std::memcpy(newItemHdl->getWritableMemory(), oldItem.getMemory(),
+ oldItem.getSize());
+ }
+
+ // Inside the MM container's lock, this checks if the old item exists to
+ // make sure that no other thread removed it, and only then replaces it.
+ if (!replaceInMMContainer(oldItem, *newItemHdl)) {
+ accessContainer_->remove(*newItemHdl);
+ return {};
+ }
+
+ // Replacing into the MM container was successful, but someone could have
+ // called insertOrReplace() or remove() before or after the
+ // replaceInMMContainer() operation, which would invalidate newItemHdl.
+ if (!newItemHdl->isAccessible()) {
+ removeFromMMContainer(*newItemHdl);
+ return {};
+ }
+
+ // no one can add or remove chained items at this point
+ if (oldItem.hasChainedItem()) {
+ // safe to acquire handle for a moving Item
+ auto oldHandle = acquire(&oldItem);
+ XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString();
+ XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString();
+ try {
+ auto l = chainedItemLocks_.lockExclusive(oldItem.getKey());
+ transferChainLocked(oldHandle, newItemHdl);
+ } catch (const std::exception& e) {
+ // this should never happen because we drained all the handles.
+ XLOGF(DFATAL, "{}", e.what());
+ throw;
+ }
+
+ XDCHECK(!oldItem.hasChainedItem());
+ XDCHECK(newItemHdl->hasChainedItem());
+ }
+ newItemHdl.unmarkNascent();
+ resHdl = std::move(newItemHdl); // guard will assign it to ctx under lock
+ return acquire(&oldItem);
+}
+
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::moveRegularItem(Item& oldItem,
ItemHandle& newItemHdl) {
@@ -1187,41 +1565,70 @@ bool CacheAllocator<CacheTrait>::moveChainedItem(ChainedItem& oldItem,
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::Item*
-CacheAllocator<CacheTrait>::findEviction(PoolId pid, ClassId cid) {
- auto& mmContainer = getMMContainer(pid, cid);
+CacheAllocator<CacheTrait>::findEviction(TierId tid, PoolId pid, ClassId cid) {
+ auto& mmContainer = getMMContainer(tid, pid, cid);
// Keep searching for a candidate until we were able to evict it
// or until the search limit has been exhausted
unsigned int searchTries = 0;
- auto itr = mmContainer.getEvictionIterator();
while ((config_.evictionSearchTries == 0 ||
- config_.evictionSearchTries > searchTries) &&
- itr) {
+ config_.evictionSearchTries > searchTries)) {
++searchTries;
- Item* candidate = itr.get();
+ Item* toRecycle = nullptr;
+ Item* candidate = nullptr;
+
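+ // Scan the eviction queue under the MM container's lock and stop at the
+ // first item whose (parent) refcount is zero and that we can mark moving.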
+ mmContainer.withEvictionIterator([this, &candidate, &toRecycle, &searchTries](auto &&itr){
+ while ((config_.evictionSearchTries == 0 ||
+ config_.evictionSearchTries > searchTries) && itr) {
+ ++searchTries;
+
+ auto *toRecycle_ = itr.get();
+ auto *candidate_ = toRecycle_->isChainedItem()
+ ? &toRecycle_->asChainedItem().getParentItem(compressor_)
+ : toRecycle_;
+
+ // make sure no other thread is evicting the item
+ if (candidate_->getRefCount() == 0 && candidate_->markMoving()) {
+ toRecycle = toRecycle_;
+ candidate = candidate_;
+ return;
+ }
+
+ ++itr;
+ }
+ });
+
+ if (!toRecycle)
+ continue;
+
+ XDCHECK(toRecycle);
+ XDCHECK(candidate);
+
// for chained items, the ownership of the parent can change. We try to
// evict what we think as parent and see if the eviction of parent
// recycles the child we intend to.
auto toReleaseHandle =
- itr->isChainedItem()
- ? advanceIteratorAndTryEvictChainedItem(itr)
- : advanceIteratorAndTryEvictRegularItem(mmContainer, itr);
+ evictNormalItem(*candidate, true /* skipIfTokenInvalid */);
+ auto ref = candidate->unmarkMoving();
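+ // ref is the refcount observed while clearing the moving bit; ref == 0
+ // means no one else holds the item, so it can be recycled directly.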
- if (toReleaseHandle) {
- if (toReleaseHandle->hasChainedItem()) {
+ if (toReleaseHandle || ref == 0u) {
+ if (candidate->hasChainedItem()) {
(*stats_.chainedItemEvictions)[pid][cid].inc();
} else {
(*stats_.regularItemEvictions)[pid][cid].inc();
}
+ } else {
+ if (candidate->hasChainedItem()) {
+ stats_.evictFailParentAC.inc();
+ } else {
+ stats_.evictFailAC.inc();
+ }
+ }
- // Invalidate iterator since later on we may use this mmContainer
- // again, which cannot be done unless we drop this iterator
- itr.destroy();
-
- // we must be the last handle and for chained items, this will be
- // the parent.
- XDCHECK(toReleaseHandle.get() == candidate || candidate->isChainedItem());
+ if (toReleaseHandle) {
+ XDCHECK(toReleaseHandle.get() == candidate);
+ XDCHECK(toRecycle == candidate || toRecycle->isChainedItem());
XDCHECK_EQ(1u, toReleaseHandle->getRefCount());
// We manually release the item here because we don't want to
@@ -1237,15 +1644,18 @@ CacheAllocator::findEviction(PoolId pid, ClassId cid) {
// recycle the candidate.
if (ReleaseRes::kRecycled ==
releaseBackToAllocator(itemToRelease, RemoveContext::kEviction,
- /* isNascent */ false, candidate)) {
- return candidate;
+ /* isNascent */ false, toRecycle)) {
+ return toRecycle;
+ }
+ } else if (ref == 0u) {
+ // it's safe to recycle the item here as there are no more
+ // references and the item could not have been marked as moving
+ // by another thread since it's detached from the MMContainer.
+ if (ReleaseRes::kRecycled ==
+ releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+ /* isNascent */ false, toRecycle)) {
+ return toRecycle;
}
- }
-
- // If we destroyed the itr to possibly evict and failed, we restart
- // from the beginning again
- if (!itr) {
- itr.resetToBegin();
}
}
return nullptr;
@@ -1300,140 +1710,89 @@ bool CacheAllocator<CacheTrait>::shouldWriteToNvmCacheExclusive(
}
template <typename CacheTrait>
-typename CacheAllocator<CacheTrait>::ItemHandle
-CacheAllocator<CacheTrait>::advanceIteratorAndTryEvictRegularItem(
- MMContainer& mmContainer, EvictionIterator& itr) {
- // we should flush this to nvmcache if it is not already present in nvmcache
- // and the item is not expired.
- Item& item = *itr;
- const bool evictToNvmCache = shouldWriteToNvmCache(item);
-
- auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey())
- : typename NvmCacheT::PutToken{};
- // record the in-flight eviciton. If not, we move on to next item to avoid
- // stalling eviction.
- if (evictToNvmCache && !token.isValid()) {
- ++itr;
- stats_.evictFailConcurrentFill.inc();
- return ItemHandle{};
- }
-
- // If there are other accessors, we should abort. Acquire a handle here since
- // if we remove the item from both access containers and mm containers
- // below, we will need a handle to ensure proper cleanup in case we end up
- // not evicting this item
- auto evictHandle = accessContainer_->removeIf(item, &itemEvictionPredicate);
-
- if (!evictHandle) {
- ++itr;
- stats_.evictFailAC.inc();
- return evictHandle;
- }
-
- mmContainer.remove(itr);
- XDCHECK_EQ(reinterpret_cast<uintptr_t>(evictHandle.get()),
- reinterpret_cast<uintptr_t>(&item));
- XDCHECK(!evictHandle->isInMMContainer());
- XDCHECK(!evictHandle->isAccessible());
-
- // If the item is now marked as moving, that means its corresponding slab is
- // being released right now. So, we look for the next item that is eligible
- // for eviction. It is safe to destroy the handle here since the moving bit
- // is set. Iterator was already advance by the remove call above.
- if (evictHandle->isMoving()) {
- stats_.evictFailMove.inc();
- return ItemHandle{};
- }
-
- // Invalidate iterator since later on if we are not evicting this
- // item, we may need to rely on the handle we created above to ensure
- // proper cleanup if the item's raw refcount has dropped to 0.
- // And since this item may be a parent item that has some child items
- // in this very same mmContainer, we need to make sure we drop this
- // exclusive iterator so we can gain access to it when we're cleaning
- // up the child items
- itr.destroy();
-
- // Ensure that there are no accessors after removing from the access
- // container
- XDCHECK(evictHandle->getRefCount() == 1);
+bool CacheAllocator<CacheTrait>::shouldEvictToNextMemoryTier(
+ TierId sourceTierId, TierId targetTierId, PoolId pid, Item& item)
+{
+ if (config_.disableEvictionToMemory)
+ return false;
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) {
- XDCHECK(token.isValid());
- nvmCache_->put(evictHandle, std::move(token));
- }
- return evictHandle;
+ // TODO: implement more advanced admission policies for memory tiers
+ return true;
}
template <typename CacheTrait>
-typename CacheAllocator<CacheTrait>::ItemHandle
-CacheAllocator<CacheTrait>::advanceIteratorAndTryEvictChainedItem(
- EvictionIterator& itr) {
- XDCHECK(itr->isChainedItem());
-
- ChainedItem* candidate = &itr->asChainedItem();
- ++itr;
-
- // The parent could change at any point through transferChain. However, if
- // that happens, we would realize that the releaseBackToAllocator return
- // kNotRecycled and we would try another chained item, leading to transient
- // failure.
- auto& parent = candidate->getParentItem(compressor_);
-
- const bool evictToNvmCache = shouldWriteToNvmCache(parent);
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(
+ TierId tid, PoolId pid, Item& item, bool fromEvictorThread) {
+ if (item.isExpired()) return acquire(&item);
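+ // An expired item need not be copied to the next tier; returning a handle
+ // to it lets the caller finish the eviction without a move.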
- auto token = evictToNvmCache ? nvmCache_->createPutToken(parent.getKey())
- : typename NvmCacheT::PutToken{};
+ TierId nextTier = tid;
+ while (++nextTier < numTiers_) { // try to evict down to the next memory tier
+ if (!shouldEvictToNextMemoryTier(tid, nextTier, pid, item))
+ continue;
- // if token is invalid, return. iterator is already advanced.
- if (evictToNvmCache && !token.isValid()) {
- stats_.evictFailConcurrentFill.inc();
- return ItemHandle{};
- }
+ // allocateInternal might trigger another eviction
+ auto newItemHdl = allocateInternalTier(nextTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromEvictorThread);
- // check if the parent exists in the hashtable and refcount is drained.
- auto parentHandle =
- accessContainer_->removeIf(parent, &itemEvictionPredicate);
- if (!parentHandle) {
- stats_.evictFailParentAC.inc();
- return parentHandle;
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ return moveRegularItemwithSync(item, newItemHdl, itemMovingPredicate);
+ }
}
- // Invalidate iterator since later on we may use the mmContainer
- // associated with this iterator which cannot be done unless we
- // drop this iterator
- //
- // This must be done once we know the parent is not nullptr.
- // Since we can very well be the last holder of this parent item,
- // which may have a chained item that is linked in this MM container.
- itr.destroy();
-
- // Ensure we have the correct parent and we're the only user of the
- // parent, then free it from access container. Otherwise, we abort
- XDCHECK_EQ(reinterpret_cast<uintptr_t>(&parent),
- reinterpret_cast<uintptr_t>(parentHandle.get()));
- XDCHECK_EQ(1u, parent.getRefCount());
+ return {};
+}
- removeFromMMContainer(*parentHandle);
+template <typename CacheTrait>
+bool
+CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(
+ TierId tid, PoolId pid, Item& item, bool fromEvictorThread) {
+ TierId nextTier = tid;
+ while (nextTier > 0) { // try to promote to the next upper memory tier
+ auto toPromoteTier = nextTier - 1;
+ --nextTier;
- XDCHECK(!parent.isInMMContainer());
- XDCHECK(!parent.isAccessible());
+ // allocateInternal might trigger another eviction
+ auto newItemHdl = allocateInternalTier(toPromoteTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromEvictorThread);
- // We need to make sure the parent is not marked as moving
- // and we're the only holder of the parent item. Safe to destroy the handle
- // here since moving bit is set.
- if (parentHandle->isMoving()) {
- stats_.evictFailParentMove.inc();
- return ItemHandle{};
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
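+ // Allow promotion if no other thread holds a reference to the item, or if
+ // the configuration tolerates transient duplicates across tiers.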
+ auto predicate = [&](const Item& item){
+ return item.getRefCount() == 0 || config_.numDuplicateElements > 0;
+ };
+ if (moveRegularItemwithSync(item, newItemHdl, predicate)) {
+ return true;
+ }
+ }
}
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) {
- XDCHECK(token.isValid());
- XDCHECK(parentHandle->hasChainedItem());
- nvmCache_->put(parentHandle, std::move(token));
- }
+ return false;
+}
- return parentHandle;
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(Item& item, bool fromEvictorThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryEvictToNextMemoryTier(tid, pid, item, fromEvictorThread);
+}
+
+template <typename CacheTrait>
+bool
+CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread);
}
template <typename CacheTrait>
@@ -1631,21 +1990,41 @@ void CacheAllocator<CacheTrait>::invalidateNvm(Item& item) {
}
}
+template <typename CacheTrait>
+TierId
+CacheAllocator<CacheTrait>::getTierId(const Item& item) const {
+ return getTierId(item.getMemory());
+}
+
+template <typename CacheTrait>
+TierId
+CacheAllocator<CacheTrait>::getTierId(const void* ptr) const {
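+ // Tiers own disjoint memory regions, so at most one allocator contains ptr.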
+ for (TierId tid = 0; tid < numTiers_; tid++) {
+ if (allocator_[tid]->isMemoryInAllocator(ptr))
+ return tid;
+ }
+
+ throw std::invalid_argument("Item does not belong to any tier!");
+}
+
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::MMContainer&
CacheAllocator<CacheTrait>::getMMContainer(const Item& item) const noexcept {
+ const auto tid = getTierId(item);
const auto allocInfo =
- allocator_->getAllocInfo(static_cast<const void*>(&item));
- return getMMContainer(allocInfo.poolId, allocInfo.classId);
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item));
+ return getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
}
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::MMContainer&
-CacheAllocator<CacheTrait>::getMMContainer(PoolId pid,
+CacheAllocator<CacheTrait>::getMMContainer(TierId tid,
+ PoolId pid,
ClassId cid) const noexcept {
- XDCHECK_LT(static_cast<size_t>(pid), mmContainers_.size());
- XDCHECK_LT(static_cast<size_t>(cid), mmContainers_[pid].size());
- return *mmContainers_[pid][cid];
+ XDCHECK_LT(static_cast<size_t>(tid), mmContainers_.size());
+ XDCHECK_LT(static_cast<size_t>(pid), mmContainers_[tid].size());
+ XDCHECK_LT(static_cast<size_t>(cid), mmContainers_[tid][pid].size());
+ return *mmContainers_[tid][pid][cid];
}
template <typename CacheTrait>
@@ -1791,8 +2170,9 @@ void CacheAllocator<CacheTrait>::markUseful(const ItemHandle& handle,
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::recordAccessInMMContainer(Item& item,
AccessMode mode) {
+ const auto tid = getTierId(item);
const auto allocInfo =
- allocator_->getAllocInfo(static_cast<const void*>(&item));
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item));
(*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc();
// track recently accessed items if needed
@@ -1800,14 +2180,15 @@ bool CacheAllocator<CacheTrait>::recordAccessInMMContainer(Item& item,
ring_->trackItem(reinterpret_cast<uintptr_t>(&item), item.getSize());
}
- auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId);
+ auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
return mmContainer.recordAccess(item, mode);
}
template <typename CacheTrait>
uint32_t CacheAllocator<CacheTrait>::getUsableSize(const Item& item) const {
+ const auto tid = getTierId(item);
const auto allocSize =
- allocator_->getAllocInfo(static_cast<const void*>(&item)).allocSize;
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item)).allocSize;
return item.isChainedItem()
? allocSize - ChainedItem::getRequiredSize(0)
: allocSize - Item::getRequiredSize(item.getKey(), 0);
@@ -1816,8 +2197,11 @@ uint32_t CacheAllocator<CacheTrait>::getUsableSize(const Item& item) const {
template <typename CacheTrait>